Refactoring of gather

This commit is contained in:
2025-09-10 23:49:33 -07:00
parent 9a887c4ed5
commit 615d8ab279
2 changed files with 23 additions and 17 deletions

View File

@@ -9,7 +9,7 @@ import click
from .inference import InferenceContext, load_ucs from .inference import InferenceContext, load_ucs
from .gather import (build_sentence_class_dataset, print_dataset_stats, from .gather import (build_sentence_class_dataset, print_dataset_stats,
ucs_definitions_generator, scan_metadata) ucs_definitions_generator, scan_metadata, walk_path)
from .recommend import print_recommendation from .recommend import print_recommendation
from .util import ffmpeg_description, parse_ucs from .util import ffmpeg_description, parse_ucs
@@ -157,8 +157,6 @@ def gather(ctx, paths, out, ucs_data):
""" """
logger.debug("GATHER mode") logger.debug("GATHER mode")
types = ['.wav', '.flac']
logger.debug(f"Loading category list...") logger.debug(f"Loading category list...")
ucs = load_ucs(full_ucs=ctx.obj['complete_ucs']) ucs = load_ucs(full_ucs=ctx.obj['complete_ucs'])
@@ -169,22 +167,9 @@ def gather(ctx, paths, out, ucs_data):
logger.info('Creating dataset for UCS categories instead of from PATH') logger.info('Creating dataset for UCS categories instead of from PATH')
paths = [] paths = []
walker_p = tqdm.tqdm(total=None, unit='dir', desc="Walking filesystem...")
for path in paths: for path in paths:
for dirpath, _, filenames in os.walk(path): scan_list += walk_path(path, catid_list)
logger.info(f"Walking directory {dirpath}")
for filename in filenames:
walker_p.update()
root, ext = os.path.splitext(filename)
if ext not in types or filename.startswith("._"):
continue
if (ucs_components := parse_ucs(root, catid_list)):
p = os.path.join(dirpath, filename)
logger.info(f"Adding path to scan list {p}")
scan_list.append((ucs_components.cat_id, p))
walker_p.close()
logger.info(f"Found {len(scan_list)} files to process.") logger.info(f"Found {len(scan_list)} files to process.")
logger.info("Building dataset...") logger.info("Building dataset...")

View File

@@ -13,6 +13,27 @@ from tabulate import tabulate
import logging import logging
import tqdm import tqdm
def walk_path(path: str, catid_list) -> list[tuple[str, str]]:
    """
    Recursively walk `path` and collect audio files whose names parse as UCS.

    Only files with a `.wav` or `.flac` extension (exact, case-sensitive
    match) are considered, and files whose name starts with "._" are skipped
    (presumably macOS AppleDouble sidecar files -- confirm). Each remaining
    file's basename, without extension, is handed to `parse_ucs` together
    with `catid_list`; when that returns a truthy result, the file is added
    to the returned list.

    :param path: root directory to walk with `os.walk`
    :param catid_list: passed through unchanged to `parse_ucs`
        (presumably the list of known UCS category IDs -- confirm against
        `parse_ucs`)
    :returns: a list of `(cat_id, file_path)` tuples, where `cat_id` is the
        `cat_id` attribute of the `parse_ucs` result and `file_path` is the
        directory path joined with the file name
    """
    types = ['.wav', '.flac']
    logger = logging.getLogger('ucsinfer')
    scan_list = []

    # The progress bar is managed by a `with` block so it is always closed,
    # even if the walk raises. This function is called once per input path,
    # so an unclosed bar would otherwise be left behind on every call.
    # NOTE(review): the bar is ticked once per *file* although its unit is
    # labelled 'dir'; the label is left unchanged here.
    with tqdm.tqdm(total=None, unit='dir',
                   desc="Walking filesystem...") as walker_p:
        for dirpath, _, filenames in os.walk(path):
            logger.info(f"Walking directory {dirpath}")
            for filename in filenames:
                walker_p.update()
                root, ext = os.path.splitext(filename)

                if ext not in types or filename.startswith("._"):
                    continue

                if (ucs_components := parse_ucs(root, catid_list)):
                    p = os.path.join(dirpath, filename)
                    logger.info(f"Adding path to scan list {p}")
                    scan_list.append((ucs_components.cat_id, p))

    return scan_list
def scan_metadata(scan_list: list[tuple[str,str]], catid_list: list[str]): def scan_metadata(scan_list: list[tuple[str,str]], catid_list: list[str]):
logger = logging.getLogger('ucsinfer') logger = logging.getLogger('ucsinfer')
for pair in tqdm.tqdm(scan_list, unit='files'): for pair in tqdm.tqdm(scan_list, unit='files'):