Compare commits
2 Commits
63b140209b
...
615d8ab279
Author | SHA1 | Date | |
---|---|---|---|
615d8ab279 | |||
9a887c4ed5 |
@@ -1,7 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
# import csv
|
# import csv
|
||||||
import logging
|
import logging
|
||||||
from subprocess import CalledProcessError
|
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
|
|
||||||
import tqdm
|
import tqdm
|
||||||
@@ -9,7 +8,8 @@ import click
|
|||||||
# from tabulate import tabulate, SEPARATING_LINE
|
# from tabulate import tabulate, SEPARATING_LINE
|
||||||
|
|
||||||
from .inference import InferenceContext, load_ucs
|
from .inference import InferenceContext, load_ucs
|
||||||
from .gather import build_sentence_class_dataset, print_dataset_stats
|
from .gather import (build_sentence_class_dataset, print_dataset_stats,
|
||||||
|
ucs_definitions_generator, scan_metadata, walk_path)
|
||||||
from .recommend import print_recommendation
|
from .recommend import print_recommendation
|
||||||
from .util import ffmpeg_description, parse_ucs
|
from .util import ffmpeg_description, parse_ucs
|
||||||
|
|
||||||
@@ -157,67 +157,30 @@ def gather(ctx, paths, out, ucs_data):
|
|||||||
"""
|
"""
|
||||||
logger.debug("GATHER mode")
|
logger.debug("GATHER mode")
|
||||||
|
|
||||||
types = ['.wav', '.flac']
|
|
||||||
|
|
||||||
logger.debug(f"Loading category list...")
|
logger.debug(f"Loading category list...")
|
||||||
ucs = load_ucs(full_ucs=ctx.obj['complete_ucs'])
|
ucs = load_ucs(full_ucs=ctx.obj['complete_ucs'])
|
||||||
|
|
||||||
scan_list = []
|
scan_list: list[tuple[str,str]] = []
|
||||||
catid_list = [cat.catid for cat in ucs]
|
catid_list = [cat.catid for cat in ucs]
|
||||||
|
|
||||||
if ucs_data:
|
if ucs_data:
|
||||||
logger.info('Creating dataset for UCS categories instead of from PATH')
|
logger.info('Creating dataset for UCS categories instead of from PATH')
|
||||||
paths = []
|
paths = []
|
||||||
|
|
||||||
walker_p = tqdm.tqdm(total=None, unit='dir', desc="Walking filesystem...")
|
|
||||||
for path in paths:
|
for path in paths:
|
||||||
for dirpath, _, filenames in os.walk(path):
|
scan_list += walk_path(path, catid_list)
|
||||||
logger.info(f"Walking directory {dirpath}")
|
|
||||||
for filename in filenames:
|
|
||||||
walker_p.update()
|
|
||||||
root, ext = os.path.splitext(filename)
|
|
||||||
if ext not in types or filename.startswith("._"):
|
|
||||||
continue
|
|
||||||
|
|
||||||
if (ucs_components := parse_ucs(root, catid_list)):
|
|
||||||
p = os.path.join(dirpath, filename)
|
|
||||||
logger.info(f"Adding path to scan list {p}")
|
|
||||||
scan_list.append((ucs_components.cat_id, p))
|
|
||||||
walker_p.close()
|
|
||||||
|
|
||||||
logger.info(f"Found {len(scan_list)} files to process.")
|
logger.info(f"Found {len(scan_list)} files to process.")
|
||||||
|
|
||||||
def scan_metadata():
|
|
||||||
for pair in tqdm.tqdm(scan_list, unit='files'):
|
|
||||||
logger.info(f"Scanning file with ffprobe: {pair[1]}")
|
|
||||||
try:
|
|
||||||
desc = ffmpeg_description(pair[1])
|
|
||||||
except CalledProcessError as e:
|
|
||||||
logger.error(f"ffprobe returned error (){e.returncode}): " \
|
|
||||||
+ e.stderr)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if desc:
|
|
||||||
yield desc, str(pair[0])
|
|
||||||
else:
|
|
||||||
comps = parse_ucs(os.path.basename(pair[1]), catid_list)
|
|
||||||
assert comps
|
|
||||||
yield comps.fx_name, str(pair[0])
|
|
||||||
|
|
||||||
def ucs_metadata():
|
|
||||||
for cat in ucs:
|
|
||||||
yield cat.explanations, cat.catid
|
|
||||||
yield ", ".join(cat.synonymns), cat.catid
|
|
||||||
|
|
||||||
logger.info("Building dataset...")
|
logger.info("Building dataset...")
|
||||||
|
|
||||||
dataset = build_sentence_class_dataset(chain(scan_metadata(),
|
dataset = build_sentence_class_dataset(
|
||||||
ucs_metadata()),
|
chain(scan_metadata(scan_list, catid_list),
|
||||||
|
ucs_definitions_generator(ucs)),
|
||||||
catid_list)
|
catid_list)
|
||||||
|
|
||||||
|
|
||||||
logger.info(f"Saving dataset to disk at {out}")
|
logger.info(f"Saving dataset to disk at {out}")
|
||||||
print_dataset_stats(dataset)
|
print_dataset_stats(dataset, catid_list)
|
||||||
dataset.save_to_disk(out)
|
dataset.save_to_disk(out)
|
||||||
|
|
||||||
|
|
||||||
|
@@ -1,9 +1,63 @@
|
|||||||
|
from .inference import Ucs
|
||||||
|
from .util import ffmpeg_description, parse_ucs
|
||||||
|
|
||||||
|
from subprocess import CalledProcessError
|
||||||
|
import os.path
|
||||||
|
|
||||||
from datasets import Dataset, Features, Value, ClassLabel, DatasetInfo
|
from datasets import Dataset, Features, Value, ClassLabel, DatasetInfo
|
||||||
from datasets.dataset_dict import DatasetDict
|
from datasets.dataset_dict import DatasetDict
|
||||||
|
|
||||||
from typing import Iterator
|
from typing import Iterator, Generator
|
||||||
|
|
||||||
from tabulate import tabulate
|
from tabulate import tabulate
|
||||||
|
import logging
|
||||||
|
import tqdm
|
||||||
|
|
||||||
|
def walk_path(path:str, catid_list) -> list[tuple[str,str]]:
    """
    Recursively walk `path` and collect the audio files whose names parse
    as UCS.

    A file is considered only when its extension is exactly `.wav` or
    `.flac` (the comparison is case-sensitive) and its name does not start
    with `._`. The filename without its extension is handed to `parse_ucs`
    together with `catid_list`; files for which that returns a falsy value
    are ignored.

    :param path: root directory to walk with `os.walk`
    :param catid_list: category IDs forwarded unchanged to `parse_ucs`
    :returns: a list of `(cat_id, file_path)` tuples, one per accepted file
    """
    types = ['.wav', '.flac']
    logger = logging.getLogger('ucsinfer')
    scan_list = []

    # Use the progress bar as a context manager so that it is always closed,
    # including when os.walk or parse_ucs raises part-way through the walk.
    with tqdm.tqdm(total=None, unit='dir',
                   desc="Walking filesystem...") as walker_p:
        for dirpath, _, filenames in os.walk(path):
            logger.info(f"Walking directory {dirpath}")
            for filename in filenames:
                # The bar advances once per file seen, accepted or not.
                walker_p.update()
                root, ext = os.path.splitext(filename)
                # "._" files are presumably macOS resource-fork sidecars
                # that carry an audio extension but no audio data.
                if ext not in types or filename.startswith("._"):
                    continue

                if (ucs_components := parse_ucs(root, catid_list)):
                    p = os.path.join(dirpath, filename)
                    logger.info(f"Adding path to scan list {p}")
                    scan_list.append((ucs_components.cat_id, p))

    return scan_list
|
||||||
|
|
||||||
|
def scan_metadata(scan_list: list[tuple[str,str]], catid_list: list[str]):
    """
    Yield one `(text, category_id)` pair for each scannable file in
    `scan_list`.

    For every `(cat_id, path)` entry the file is passed to
    `ffmpeg_description`. When that returns a truthy description, the
    description is used as the text; otherwise the `fx_name` parsed from
    the file's basename is used instead. Files for which
    `ffmpeg_description` raises `CalledProcessError` are logged as errors
    and skipped.

    :param scan_list: `(cat_id, file_path)` tuples to scan
    :param catid_list: category IDs forwarded unchanged to `parse_ucs`
    :returns: a generator of `(text, str(cat_id))` tuples
    """
    logger = logging.getLogger('ucsinfer')
    for catid, path in tqdm.tqdm(scan_list, unit='files'):
        logger.info(f"Scanning file with ffprobe: {path}")
        try:
            desc = ffmpeg_description(path)
        except CalledProcessError as e:
            # Interpolate stderr rather than concatenating it: it is None
            # when the output was not captured and bytes when it was
            # captured in binary mode, and either would make `str + stderr`
            # raise TypeError inside this handler.
            logger.error(
                f"ffprobe returned error ({e.returncode}): {e.stderr}")
            continue

        if desc:
            yield desc, str(catid)
        else:
            # NOTE(review): the basename passed here still carries its
            # extension, whereas walk_path parses the name without it;
            # confirm parse_ucs treats both forms the same.
            comps = parse_ucs(os.path.basename(path), catid_list)
            # Internal invariant: entries are expected to have parsed
            # successfully before they were added to the scan list.
            assert comps
            yield comps.fx_name, str(catid)
|
||||||
|
|
||||||
|
|
||||||
|
def ucs_definitions_generator(ucs: list[Ucs]) \
        -> Generator[tuple[str,str],None, None]:
    """
    Yield two `(text, catid)` pairs for every category in `ucs`.

    The first pair carries the category's `explanations` value; the second
    carries its `synonymns` joined into one comma-separated string.

    :param ucs: the categories to turn into text/label pairs
    :returns: a generator of `(text, catid)` tuples
    """
    for category in ucs:
        explanation_pair = (category.explanations, category.catid)
        yield explanation_pair

        synonym_text = ", ".join(category.synonymns)
        yield synonym_text, category.catid
|
||||||
|
|
||||||
def print_dataset_stats(dataset: DatasetDict, catlist: list[str]):
|
def print_dataset_stats(dataset: DatasetDict, catlist: list[str]):
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user