Compare commits: 2 Commits (63b140209b...615d8ab279)
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 615d8ab279 | |||
| 9a887c4ed5 | |||
| @@ -1,7 +1,6 @@ | |||||||
| import os | import os | ||||||
| # import csv | # import csv | ||||||
| import logging | import logging | ||||||
| from subprocess import CalledProcessError |  | ||||||
| from itertools import chain | from itertools import chain | ||||||
|  |  | ||||||
| import tqdm | import tqdm | ||||||
| @@ -9,7 +8,8 @@ import click | |||||||
| # from tabulate import tabulate, SEPARATING_LINE | # from tabulate import tabulate, SEPARATING_LINE | ||||||
|  |  | ||||||
| from .inference import InferenceContext, load_ucs | from .inference import InferenceContext, load_ucs | ||||||
| from .gather import build_sentence_class_dataset, print_dataset_stats  | from .gather import (build_sentence_class_dataset, print_dataset_stats,  | ||||||
|  |                      ucs_definitions_generator, scan_metadata, walk_path) | ||||||
| from .recommend import print_recommendation | from .recommend import print_recommendation | ||||||
| from .util import ffmpeg_description, parse_ucs | from .util import ffmpeg_description, parse_ucs | ||||||
|  |  | ||||||
| @@ -157,67 +157,30 @@ def gather(ctx, paths, out, ucs_data): | |||||||
|     """ |     """ | ||||||
|     logger.debug("GATHER mode") |     logger.debug("GATHER mode") | ||||||
|  |  | ||||||
|     types = ['.wav', '.flac'] |  | ||||||
|  |  | ||||||
|     logger.debug(f"Loading category list...") |     logger.debug(f"Loading category list...") | ||||||
|     ucs = load_ucs(full_ucs=ctx.obj['complete_ucs']) |     ucs = load_ucs(full_ucs=ctx.obj['complete_ucs']) | ||||||
|  |  | ||||||
|     scan_list = [] |     scan_list: list[tuple[str,str]] = [] | ||||||
|     catid_list = [cat.catid for cat in ucs] |     catid_list = [cat.catid for cat in ucs] | ||||||
|  |  | ||||||
|     if ucs_data: |     if ucs_data: | ||||||
|         logger.info('Creating dataset for UCS categories instead of from PATH') |         logger.info('Creating dataset for UCS categories instead of from PATH') | ||||||
|         paths = [] |         paths = [] | ||||||
|  |  | ||||||
|     walker_p = tqdm.tqdm(total=None, unit='dir', desc="Walking filesystem...") |  | ||||||
|     for path in paths: |     for path in paths: | ||||||
|         for dirpath, _, filenames in os.walk(path): |         scan_list += walk_path(path, catid_list) | ||||||
|             logger.info(f"Walking directory {dirpath}") |  | ||||||
|             for filename in filenames: |  | ||||||
|                 walker_p.update() |  | ||||||
|                 root, ext = os.path.splitext(filename) |  | ||||||
|                 if ext not in types or filename.startswith("._"): |  | ||||||
|                     continue  |  | ||||||
|  |  | ||||||
|                 if (ucs_components := parse_ucs(root, catid_list)): |  | ||||||
|                     p = os.path.join(dirpath, filename) |  | ||||||
|                     logger.info(f"Adding path to scan list {p}") |  | ||||||
|                     scan_list.append((ucs_components.cat_id, p)) |  | ||||||
|     walker_p.close() |  | ||||||
|  |  | ||||||
|     logger.info(f"Found {len(scan_list)} files to process.") |     logger.info(f"Found {len(scan_list)} files to process.") | ||||||
|  |  | ||||||
|     def scan_metadata(): |  | ||||||
|         for pair in tqdm.tqdm(scan_list, unit='files'): |  | ||||||
|             logger.info(f"Scanning file with ffprobe: {pair[1]}") |  | ||||||
|             try: |  | ||||||
|                 desc = ffmpeg_description(pair[1]) |  | ||||||
|             except CalledProcessError as e: |  | ||||||
|                 logger.error(f"ffprobe returned error (){e.returncode}): " \ |  | ||||||
|                         + e.stderr) |  | ||||||
|                 continue |  | ||||||
|  |  | ||||||
|             if desc: |  | ||||||
|                 yield desc, str(pair[0]) |  | ||||||
|             else: |  | ||||||
|                 comps = parse_ucs(os.path.basename(pair[1]), catid_list) |  | ||||||
|                 assert comps |  | ||||||
|                 yield comps.fx_name, str(pair[0]) |  | ||||||
|  |  | ||||||
|     def ucs_metadata(): |  | ||||||
|         for cat in ucs: |  | ||||||
|             yield cat.explanations, cat.catid |  | ||||||
|             yield ", ".join(cat.synonymns), cat.catid |  | ||||||
|  |  | ||||||
|     logger.info("Building dataset...") |     logger.info("Building dataset...") | ||||||
|  |  | ||||||
|     dataset = build_sentence_class_dataset(chain(scan_metadata(),  |     dataset = build_sentence_class_dataset( | ||||||
|                                                  ucs_metadata()), |             chain(scan_metadata(scan_list, catid_list),                                | ||||||
|  |                   ucs_definitions_generator(ucs)),                               | ||||||
|             catid_list) |             catid_list) | ||||||
|   |   | ||||||
|      |  | ||||||
|     logger.info(f"Saving dataset to disk at {out}") |     logger.info(f"Saving dataset to disk at {out}") | ||||||
|     print_dataset_stats(dataset) |     print_dataset_stats(dataset, catid_list) | ||||||
|     dataset.save_to_disk(out) |     dataset.save_to_disk(out) | ||||||
|      |      | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,9 +1,63 @@ | |||||||
|  | from .inference import Ucs | ||||||
|  | from .util import ffmpeg_description, parse_ucs | ||||||
|  |  | ||||||
|  | from subprocess import CalledProcessError | ||||||
|  | import os.path | ||||||
|  |  | ||||||
| from datasets import Dataset, Features, Value, ClassLabel, DatasetInfo | from datasets import Dataset, Features, Value, ClassLabel, DatasetInfo | ||||||
| from datasets.dataset_dict import DatasetDict | from datasets.dataset_dict import DatasetDict | ||||||
|  |  | ||||||
| from typing import Iterator | from typing import Iterator, Generator | ||||||
|  |  | ||||||
| from tabulate import tabulate | from tabulate import tabulate | ||||||
|  | import logging | ||||||
|  | import tqdm | ||||||
|  |  | ||||||
|  | def walk_path(path:str, catid_list) -> list[tuple[str,str]]: | ||||||
|  |     types = ['.wav', '.flac'] | ||||||
|  |     logger = logging.getLogger('ucsinfer') | ||||||
|  |     walker_p = tqdm.tqdm(total=None, unit='dir', desc="Walking filesystem...") | ||||||
|  |     scan_list = [] | ||||||
|  |  | ||||||
|  |     for dirpath, _, filenames in os.walk(path): | ||||||
|  |         logger.info(f"Walking directory {dirpath}") | ||||||
|  |         for filename in filenames: | ||||||
|  |             walker_p.update() | ||||||
|  |             root, ext = os.path.splitext(filename) | ||||||
|  |             if ext not in types or filename.startswith("._"): | ||||||
|  |                 continue  | ||||||
|  |  | ||||||
|  |             if (ucs_components := parse_ucs(root, catid_list)): | ||||||
|  |                 p = os.path.join(dirpath, filename) | ||||||
|  |                 logger.info(f"Adding path to scan list {p}") | ||||||
|  |                 scan_list.append((ucs_components.cat_id, p)) | ||||||
|  |  | ||||||
|  |     return scan_list | ||||||
|  |  | ||||||
|  | def scan_metadata(scan_list: list[tuple[str,str]], catid_list: list[str]): | ||||||
|  |     logger = logging.getLogger('ucsinfer') | ||||||
|  |     for pair in tqdm.tqdm(scan_list, unit='files'): | ||||||
|  |         logger.info(f"Scanning file with ffprobe: {pair[1]}") | ||||||
|  |         try: | ||||||
|  |             desc = ffmpeg_description(pair[1]) | ||||||
|  |         except CalledProcessError as e: | ||||||
|  |             logger.error(f"ffprobe returned error (){e.returncode}): " \ | ||||||
|  |                     + e.stderr) | ||||||
|  |             continue | ||||||
|  |  | ||||||
|  |         if desc: | ||||||
|  |             yield desc, str(pair[0]) | ||||||
|  |         else: | ||||||
|  |             comps = parse_ucs(os.path.basename(pair[1]), catid_list) | ||||||
|  |             assert comps | ||||||
|  |             yield comps.fx_name, str(pair[0]) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def ucs_definitions_generator(ucs: list[Ucs]) \ | ||||||
|  |         -> Generator[tuple[str,str],None, None]: | ||||||
|  |     for cat in ucs: | ||||||
|  |         yield cat.explanations, cat.catid | ||||||
|  |         yield ", ".join(cat.synonymns), cat.catid | ||||||
|  |  | ||||||
| def print_dataset_stats(dataset: DatasetDict, catlist: list[str]): | def print_dataset_stats(dataset: DatasetDict, catlist: list[str]): | ||||||
|      |      | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user