In [3]:
import json

# Load the UCS (Universal Category System) category list and collect its CatIDs
with open("ucs-community/json/en.json") as f:
    ucs = json.load(f)

cat_ids = [x['CatID'] for x in ucs]
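A quick way to sanity-check the loaded list is to query it directly. The exact contents depend on the UCS release shipped in ucs-community/json/en.json, so the CatID below is illustrative only:
In [ ]:
len(cat_ids)              # number of categories in this UCS release
"METLImpt" in cat_ids     # True only if this (illustrative) CatID is in the loaded list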
In [4]:
from typing import Optional

def ucs_catid(path: str) -> Optional[str]:
    'The UCS CatID if the file at `path` has a valid UCS filename, otherwise None'
    import os.path
    basename = os.path.basename(path)
    first_component = basename.split("_")[0]
    if first_component in cat_ids:
        return first_component
    else:
        return None
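As a sanity check, ucs_catid can be tried on a couple of hypothetical filenames; the first only returns a CatID if its prefix actually appears in cat_ids:
In [ ]:
# Hypothetical paths, for illustration only
ucs_catid("/sfx/METLImpt_Steel Door Slam_JH.wav")   # "METLImpt" if that CatID is in cat_ids
ucs_catid("/sfx/door slam take 3.wav")              # None -- no UCS CatID prefix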
In [5]:
from typing import Optional

def description(path: str) -> Optional[str]:
    'The embedded "comment" metadata tag of the file at `path`, if present'
    import json, subprocess
    result = subprocess.run(['ffprobe', '-show_format', '-of', 'json', path],
                            capture_output=True)
    try:
        result.check_returncode()
    except subprocess.CalledProcessError:
        return None
    stream = json.loads(result.stdout)
    fmt = stream.get("format", None)
    if fmt:
        tags = fmt.get("tags", None)
        if tags:
            return tags.get("comment", None)
    return None
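Since description shells out to ffprobe, it assumes ffprobe is available on the PATH. A minimal sketch of a call on a hypothetical file:
In [ ]:
# Hypothetical path; returns the embedded comment tag, or None if ffprobe
# fails or the file carries no "comment" metadata
description("/sfx/METLImpt_Steel Door Slam_JH.wav")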
In [15]:
from typing import Optional, Tuple

def test_data_for_file(path: str) -> Optional[Tuple[str, str]]:
    'CatID and description if both are present'
    catid = ucs_catid(path)
    if catid is None:
        return None
    desc = description(path)
    if desc is not None:
        return (catid, desc)
    else:
        return None

def collect_dataset(scan_root: str, set_name: str):
    """
    Scans scan_root recursively and collects all CatID/description pairs
    it can find, writing them to `<set_name>.csv`.
    """
    import os, csv
    test_data = []
    for root, _, files in os.walk(scan_root):
        for file in files:
            if file.endswith(".wav") or file.endswith(".flac"):
                if test_datum := test_data_for_file(os.path.join(root, file)):
                    test_data.append(test_datum)
    with open(set_name + '.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Category', 'Description'])
        for row in test_data:
            writer.writerow(row)
In [17]:
collect_dataset("/Volumes/NAS SFX Library/JAMIELIB Libraries by Studio/_Designers/Jamie Hardt","jamie_files")
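Once collect_dataset has run, the resulting CSV can be read back for a quick look; this sketch assumes the jamie_files.csv written by the call above and uses the same standard-library csv module:
In [ ]:
import csv

# Read back the dataset written above for inspection
with open("jamie_files.csv", newline='') as f:
    rows = list(csv.reader(f))

rows[0]      # header: ['Category', 'Description']
rows[1:4]    # first few (CatID, description) pairs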
In [ ]: