Files
ucsinfer/notebooks/02_Gather Training Data.ipynb
2025-08-03 15:03:54 -07:00

4.0 KiB

In [3]:
import json

# Load the UCS category table and collect every CatID it defines; `cat_ids`
# is the list that ucs_catid() checks filename prefixes against.
# The encoding is given explicitly so the JSON is decoded as UTF-8 regardless
# of the platform's locale default.
with open("ucs-community/json/en.json", encoding="utf-8") as f:
    ucs = json.load(f)

# Built after the file is closed; only the parsed data is needed here.
cat_ids = [x['CatID'] for x in ucs]
In [4]:
# Imported ahead of the def because the return annotation below is evaluated
# when the function is defined, so `Optional` must already be in scope.
from typing import Optional

def ucs_catid(path: str) -> Optional[str]:
    """
    Return the UCS CatID encoded in the filename at `path`, if there is one.

    The candidate CatID is the part of the basename before the first
    underscore. It is accepted only if it appears in the module-level
    `cat_ids` list.

    :param path: path to a file; only its basename is examined.
    :returns: the CatID string, or None if the first filename component is
        not a known CatID.
    """
    import os.path

    basename = os.path.basename(path)
    first_component = basename.split("_")[0]

    if first_component in cat_ids:
        return first_component

    # None (rather than False) matches the Optional[str] annotation and lets
    # callers that test `is None` skip files without a valid CatID.
    return None
In [5]:
from typing import Optional

def description(path: str) -> Optional[str]:
    """
    Return the `comment` format tag that ffprobe reports for the file at `path`.

    Runs `ffprobe -show_format -of json` on the file and reads
    `format.tags.comment` from the JSON it prints.

    :param path: path of the media file to probe.
    :returns: the value of the `comment` tag, or None if ffprobe exits with a
        non-zero status or its output has no format, tags, or comment entry.
    """
    import json, subprocess

    result = subprocess.run(['ffprobe', '-show_format', '-of', 'json', path], capture_output=True)

    # A failed probe simply means "no description". The exit status is checked
    # directly instead of wrapping check_returncode() in a bare `except:`,
    # which would also swallow unrelated exceptions such as KeyboardInterrupt.
    if result.returncode != 0:
        return None

    probe = json.loads(result.stdout)
    fmt = probe.get("format")
    if not fmt:
        return None

    tags = fmt.get("tags")
    if not tags:
        return None

    return tags.get("comment")
In [15]:
from typing import Optional, Tuple

def test_data_for_file(path: str) -> Optional[Tuple[str, str]]:
    'CatID and description if both are present'

    catid = ucs_catid(path)
    if catid is None:
        return None
        
    desc = description(path)

    if desc is not None:
        return (catid, desc)
    else:
        return None

def collect_dataset(scan_root: str, set_name: str):
    """
    Scan `scan_root` recursively and write every CatID/description pair found
    to a CSV file.

    Every file whose name ends in `.wav` or `.flac` (in any letter case) is
    passed to test_data_for_file(); files for which it returns nothing are
    skipped. Directories and files are visited in sorted order so repeated
    runs over the same tree produce the same CSV.

    :param scan_root: directory to walk.
    :param set_name: output path without extension; the data is written to
        `set_name + '.csv'` with a `Category,Description` header row.
    """
    import os, csv

    audio_suffixes = (".wav", ".flac")

    test_data = []
    for root, dirs, files in os.walk(scan_root):
        # Sorting `dirs` in place makes os.walk descend in a stable order.
        dirs.sort()
        for file in sorted(files):
            # Compare case-insensitively so names like "X.WAV" are included.
            if not file.lower().endswith(audio_suffixes):
                continue
            if test_datum := test_data_for_file(os.path.join(root, file)):
                test_data.append(test_datum)

    # newline='' is what the csv module requires of files it writes to, so
    # that line endings and newlines embedded in descriptions are handled
    # correctly; UTF-8 keeps non-ASCII descriptions portable across platforms.
    with open(set_name + '.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Category', 'Description'])
        writer.writerows(test_data)
    
In [17]:
# Walk this library folder and write the CatID/description pairs it yields
# to jamie_files.csv in the current working directory.
collect_dataset(
    scan_root="/Volumes/NAS SFX Library/JAMIELIB Libraries by Studio/_Designers/Jamie Hardt",
    set_name="jamie_files",
)
In [ ]: