From 808389e8425c90dcad2d6b89e1bb3b98cde19f3c Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 29 Nov 2024 12:16:35 +0100 Subject: [PATCH 1/5] Fix finding cached files for flavor in load() --- audb/core/load.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/audb/core/load.py b/audb/core/load.py index 74ad3dc7..ccf5ce01 100644 --- a/audb/core/load.py +++ b/audb/core/load.py @@ -742,6 +742,9 @@ def _load_files( flavor, verbose, ) + print(f"{flavor=}") + print(f"{files[:3]=}") + print(f"{missing_files[:3]=}") if missing_files: if cached_versions is None: cached_versions = _cached_versions( @@ -862,6 +865,12 @@ def _missing_files( """ missing_files = [] + # Adjust expected file extensions, + # if a specific file format is requested. + # See https://github.com/audeering/audb/issues/324 + if files_type == "media" and flavor.format is not None: + files = audformat.utils.replace_file_extension(files, flavor.format) + for file in audeer.progress_bar( files, desc=f"Missing {files_type}", From d55cfc287ddf183b7c30116cd2855bcfd318baf3 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 29 Nov 2024 13:15:41 +0100 Subject: [PATCH 2/5] Add test for missing files --- tests/test_load.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/test_load.py b/tests/test_load.py index 112f2c5d..e294c85d 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -409,7 +409,7 @@ def test_load_from_cache(dbs): db = audb.load( DB_NAME, version="1.0.0", - format="flac", + format=format, full_path=False, num_workers=pytest.NUM_WORKERS, verbose=False, @@ -430,11 +430,26 @@ def test_load_from_cache(dbs): for file in db.files: assert os.path.exists(os.path.join(db_root, file)) + # Ensure no media files are marked as missing files, + # when requested in format different from original + # (https://github.com/audeering/audb/issues/324) + original_files = audformat.utils.replace_file_extension(db.files, "wav") + assert ( + audb.core.load._missing_files( + original_files, + "media", + db_root, + audb.Flavor(format=format), + False, + ) + == [] + ) + version = "2.0.0" db = audb.load( DB_NAME, version="2.0.0", - format="flac", + format=format, full_path=False, num_workers=pytest.NUM_WORKERS, verbose=False, From 0f3e74d79bfc3d0e51a77a85b4db31a5bb1dc279 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 29 Nov 2024 13:34:54 +0100 Subject: [PATCH 3/5] Fix implementation --- audb/core/load.py | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/audb/core/load.py b/audb/core/load.py index ccf5ce01..5b6cc54e 100644 --- a/audb/core/load.py +++ b/audb/core/load.py @@ -742,9 +742,6 @@ def _load_files( flavor, verbose, ) - print(f"{flavor=}") - print(f"{files[:3]=}") - print(f"{missing_files[:3]=}") if missing_files: if cached_versions is None: cached_versions = _cached_versions( @@ -865,25 +862,37 @@ def _missing_files( """ missing_files = [] - # Adjust expected file extensions, - # if a specific file format is requested. - # See https://github.com/audeering/audb/issues/324 - if files_type == "media" and flavor.format is not None: - files = audformat.utils.replace_file_extension(files, flavor.format) + if files_type == "table": + + def file_cached(file): + return os.path.exists( + os.path.join(db_root, f"db.{file}.csv") + ) or os.path.exists(os.path.join(db_root, f"db.{file}.parquet")) + + elif files_type == "media" and flavor.format is not None: + + def file_cached(file): + # Adjust expected file extensions, + # if a specific file format is requested. + # See https://github.com/audeering/audb/issues/324 + return os.path.exists( + os.path.join( + db_root, audeer.replace_file_extension(file, flavor.format) + ) + ) + + else: + + def file_cached(file): + return os.path.exists(os.path.join(db_root, file)) for file in audeer.progress_bar( files, desc=f"Missing {files_type}", disable=not verbose, ): - if files_type == "table": - if not os.path.exists( - os.path.join(db_root, f"db.{file}.csv") - ) and not os.path.exists(os.path.join(db_root, f"db.{file}.parquet")): - missing_files.append(file) - else: - if not os.path.exists(os.path.join(db_root, file)): - missing_files.append(file) + if not file_cached(file): + missing_files.append(file) return missing_files From c8ff650c541ac7261cf4716433c0bf65eb2714b1 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 29 Nov 2024 14:04:52 +0100 Subject: [PATCH 4/5] Improve test --- tests/test_load.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_load.py b/tests/test_load.py index e294c85d..3a3f6d61 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -409,7 +409,7 @@ def test_load_from_cache(dbs): db = audb.load( DB_NAME, version="1.0.0", - format=format, + format="flac", full_path=False, num_workers=pytest.NUM_WORKERS, verbose=False, @@ -430,8 +430,8 @@ def test_load_from_cache(dbs): for file in db.files: assert os.path.exists(os.path.join(db_root, file)) - # Ensure no media files are marked as missing files, - # when requested in format different from original + # Ensure no media files in flavor cache are marked as missing files, + # when flavor format is different from original format # (https://github.com/audeering/audb/issues/324) original_files = audformat.utils.replace_file_extension(db.files, "wav") assert ( @@ -439,7 +439,7 @@ def test_load_from_cache(dbs): original_files, "media", db_root, - audb.Flavor(format=format), + audb.Flavor(format="flac"), False, ) == [] @@ -449,7 +449,7 @@ def test_load_from_cache(dbs): db = audb.load( DB_NAME, version="2.0.0", - format=format, + format="flac", full_path=False, num_workers=pytest.NUM_WORKERS, verbose=False, From ac60f8a3a0212b00eb7cac351f7727412a31d4bd Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 29 Nov 2024 14:05:24 +0100 Subject: [PATCH 5/5] Improve code --- audb/core/load.py | 44 ++++++++++++-------------------------------- 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/audb/core/load.py b/audb/core/load.py index 5b6cc54e..fbce72dc 100644 --- a/audb/core/load.py +++ b/audb/core/load.py @@ -860,41 +860,21 @@ def _missing_files( list of missing files or table IDs """ - missing_files = [] - - if files_type == "table": - - def file_cached(file): - return os.path.exists( - os.path.join(db_root, f"db.{file}.csv") - ) or os.path.exists(os.path.join(db_root, f"db.{file}.parquet")) - - elif files_type == "media" and flavor.format is not None: - - def file_cached(file): - # Adjust expected file extensions, - # if a specific file format is requested. - # See https://github.com/audeering/audb/issues/324 - return os.path.exists( - os.path.join( - db_root, audeer.replace_file_extension(file, flavor.format) - ) - ) - - else: - def file_cached(file): + def is_cached(file): + if files_type == "table": + path1 = os.path.join(db_root, f"db.{file}.csv") + path2 = os.path.join(db_root, f"db.{file}.parquet") + return os.path.exists(path1) or os.path.exists(path2) + elif files_type == "media" and flavor.format is not None: + # https://github.com/audeering/audb/issues/324 + cached_file = audeer.replace_file_extension(file, flavor.format) + return os.path.exists(os.path.join(db_root, cached_file)) + else: return os.path.exists(os.path.join(db_root, file)) - for file in audeer.progress_bar( - files, - desc=f"Missing {files_type}", - disable=not verbose, - ): - if not file_cached(file): - missing_files.append(file) - - return missing_files + pbar = audeer.progress_bar(files, desc=f"Missing {files_type}", disable=not verbose) + return [file for file in pbar if not is_cached(file)] def _remove_media(