Refactoring of gather

Command-line entry point:wq
2025-09-10 23:49:33 -07:00 · 2025-09-10 23:38:33 -07:00 · 2025-09-10 22:23:57 -07:00 · 2025-09-10 22:08:16 -07:00 · 2025-09-10 22:07:53 -07:00 · 2025-09-06 14:10:53 -07:00
7 changed files with 152 additions and 161 deletions
--- a/TODO.md
+++ b/TODO.md
@@ -17,8 +17,9 @@
 - Print more information about the dataset coverage of UCS
  - Allow skipping model testing for this
 - Print raw output
- Maybe load everything into a sqlite for slicker reporting
+<!-- - Maybe load everything into a sqlite for slicker reporting -->
 ## Utility
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,3 +26,6 @@ build-backend = "poetry.core.masonry.api"
 ipython = "^9.4.0"
 jupyter = "^1.1.1"
 [tool.poetry.scripts]
 ucsinfer = "ucsinfer.__main__:ucsinfer"
--- a/ucsinfer/main.py
+++ b/ucsinfer/main.py
@@ -1,14 +1,15 @@
 import os
 # import csv
 import logging
-from subprocess import CalledProcessError
+from itertools import chain
 import tqdm
 import click
 # from tabulate import tabulate, SEPARATING_LINE
 from .inference import InferenceContext, load_ucs
-from .gather import build_sentence_class_dataset 
+from .gather import (build_sentence_class_dataset, print_dataset_stats, 
                     ucs_definitions_generator, scan_metadata, walk_path)
 from .recommend import print_recommendation
 from .util import ffmpeg_description, parse_ucs
@@ -102,6 +103,11 @@ def recommend(ctx, text, paths, interactive, skip_ucs):
    catlist = [x.catid for x in inference_ctx.catlist]
    for path in paths:
        _, ext = os.path.splitext(path) 
        if ext not in (".wav", ".flac"):
            continue
        basename = os.path.basename(path)
        if skip_ucs and parse_ucs(basename, catlist):
            continue 
@@ -151,12 +157,10 @@ def gather(ctx, paths, out, ucs_data):
    """
    logger.debug("GATHER mode")
    types = ['.wav', '.flac']
    logger.debug(f"Loading category list...")
    ucs = load_ucs(full_ucs=ctx.obj['complete_ucs'])
-    scan_list = []
+    scan_list: list[tuple[str,str]] = []
    catid_list = [cat.catid for cat in ucs]
    if ucs_data:
@@ -164,50 +168,19 @@ def gather(ctx, paths, out, ucs_data):
        paths = []
    for path in paths:
-        for dirpath, _, filenames in os.walk(path):
+        scan_list += walk_path(path, catid_list)
            logger.info(f"Walking directory {dirpath}")
            for filename in filenames:
                root, ext = os.path.splitext(filename)
                if ext not in types or filename.startswith("._"):
                    continue 
                if (ucs_components := parse_ucs(root, catid_list)):
                    p = os.path.join(dirpath, filename)
                    logger.info(f"Adding path to scan list {p}")
                    scan_list.append((ucs_components.cat_id, p))
    logger.info(f"Found {len(scan_list)} files to process.")
    def scan_metadata():
        for pair in tqdm.tqdm(scan_list, unit='files'):
            logger.info(f"Scanning file with ffprobe: {pair[1]}")
            try:
                desc = ffmpeg_description(pair[1])
            except CalledProcessError as e:
                logger.error(f"ffprobe returned error {e.returncode}: " \
                        + e.stderr)
                continue
            if desc:
                yield desc, str(pair[0])
            else:
                comps = parse_ucs(os.path.basename(pair[1]), catid_list)
                assert comps
                yield comps.fx_name, str(pair[0])
    def ucs_metadata():
        for cat in ucs:
            yield cat.explanations, cat.catid
            yield ", ".join(cat.synonymns), cat.catid
    logger.info("Building dataset...")
-    if ucs_data:
+
-        dataset = build_sentence_class_dataset(ucs_metadata(), catid_list)
+    dataset = build_sentence_class_dataset(
-    else:
+            chain(scan_metadata(scan_list, catid_list),                               
-        dataset = build_sentence_class_dataset(scan_metadata(), catid_list)
+                  ucs_definitions_generator(ucs)),                              
            catid_list)
    logger.info(f"Saving dataset to disk at {out}")
    print_dataset_stats(dataset, catid_list)
    dataset.save_to_disk(out)
@@ -222,114 +195,15 @@ def finetune(ctx):
@ucsinfer.command('evaluate')
-@click.option('--offset', type=int, default=0, metavar="<int>", 
+@click.argument('dataset', default='dataset/')
              help='Skip this many records in the dataset before processing')
@click.option('--limit', type=int, default=-1, metavar="<int>",
              help='Process this many records and then exit')
@click.argument('dataset', type=click.File('r', encoding='utf8'),
                default='dataset.csv')
@click.pass_context
-def evaluate(ctx, dataset, offset, limit, no_foley):
+def evaluate(ctx, dataset, offset, limit):
    """
    Use datasets to evaluate model performance 
    The `evaluate` command reads the input DATASET file row by row and 
    performs a classification of the given description against the selected 
    model (either the default or using the --model option). The command then 
    checks if the model inferred the correct category as given by the dataset.
    The model gives its top 10 possible categories for a given description, 
    and the results are tabulated according to (1) whether the top
    classification was correct, (2) whether the correct classification was in
    the top 5, or (3) whether it was in the top 10. The worst-performing
    category, the one with the most misses, is also reported as well as the
    category coverage, how many categories are present in the dataset.
    NOTE: With experimentation it was found that foley items generally were 
    classified according to their subject and not whether or not they were
    foley, and so these categories can be excluded with the --no-foley option.
    """
    logger.debug("EVALUATE mode")
    logger.warning("Model evaluation is not currently implemented")
-    # inference_context = InferenceContext(
+    
    #         ctx.obj['model_name'], use_cached_model=ctx.obj['model_cache'],
    #         use_full_ucs=ctx.obj['complete_ucs'])
    #
    # reader = csv.reader(dataset)
    #
    # results = []
    # 
    # if offset > 0:
    #     logger.debug(f"Skipping {offset} records...")
    #
    # if limit > 0:
    #     logger.debug(f"Will only evaluate {limit} records...")
    #
    # progress_bar = tqdm.tqdm(total=limit, 
    #                          desc="Processing dataset...",
    #                          unit="rec")
    # for i, row in enumerate(reader):
    #     if i < offset:
    #         continue
    #
    #     if limit > 0 and i >= limit + offset:
    #         break
    #
    #     cat_id, description = row
    #     if no_foley and cat_id in ['FOLYProp', 'FOLYFeet']:
    #         continue 
    #
    #     guesses = inference_context.classify_text_ranked(description, limit=10)
    #     if cat_id == guesses[0]:
    #         results.append({'catid': cat_id, 'result': "TOP"})
    #     elif cat_id in guesses[0:5]:
    #         results.append({'catid': cat_id, 'result': "TOP_5"})
    #     elif cat_id in guesses:
    #         results.append({'catid': cat_id, 'result': "TOP_10"})
    #     else:
    #         results.append({'catid': cat_id, 'result': "MISS"})
    #     
    #     progress_bar.update(1)
    #
    # total = len(results)
    # total_top = len([x for x in results if x['result'] == 'TOP'])
    # total_top_5 = len([x for x in results if x['result'] == 'TOP_5'])
    # total_top_10 = len([x for x in results if x['result'] == 'TOP_10'])
    #
    # cats = set([x['catid'] for x in results])
    # total_cats = len(cats)
    #
    # miss_counts = []
    # for cat in cats:
    #     miss_counts.append(
    #         (cat, len([x for x in results
    #                    if x['catid'] == cat and x['result'] == 'MISS'])))
    #
    # miss_counts = sorted(miss_counts, key=lambda x: x[1])
    #
    # print(f"## Results for Model {model} ##\n")
    #
    # if no_foley:
    #     print("(FOLYProp and FOLYFeet have been omitted from the dataset.)\n")
    #
    # table = [
    #     ["Total records in sample:", f"{total}"],
    #     ["Top Result:", f"{total_top}",
    #      f"{float(total_top)/float(total):.2%}"],
    #     ["Top 5 Result:", f"{total_top_5}",
    #      f"{float(total_top_5)/float(total):.2%}"],
    #     ["Top 10 Result:", f"{total_top_10}",
    #      f"{float(total_top_10)/float(total):.2%}"],
    #     SEPARATING_LINE,
    #     ["UCS category count:", f"{len(inference_context.catlist)}"],
    #     ["Total categories in sample:", f"{total_cats}",
    #      f"{float(total_cats)/float(len(inference_context.catlist)):.2%}"],
    #     [f"Most missed category ({miss_counts[-1][0]}):",
    #      f"{miss_counts[-1][1]}",
    #      f"{float(miss_counts[-1][1])/float(total):.2%}"]
    # ]
    #
    # print(tabulate(table, headers=['', 'n', 'pct'], tablefmt='github'))
 if __name__ == '__main__':
--- a/ucsinfer/evaluate.py
+++ b/ucsinfer/evaluate.py
@@ -0,0 +1,7 @@
 # from sentence_transformers import SentenceTransformer
 # from sentence_transformers.evaluation import BinaryClassificationEvaluator
 # from datasets import load_dataset_from_disk, DatasetDict
 #
--- a/ucsinfer/gather.py
+++ b/ucsinfer/gather.py
@@ -1,13 +1,82 @@
-from datasets import Dataset, Features, Value, ClassLabel, DatasetInfo
+from .inference import Ucs
 from .util import ffmpeg_description, parse_ucs
-from typing import Generator, Any
+from subprocess import CalledProcessError
 import os.path
 from datasets import Dataset, Features, Value, ClassLabel, DatasetInfo
 from datasets.dataset_dict import DatasetDict
 from typing import Iterator, Generator
 from tabulate import tabulate
 import logging
 import tqdm
 def walk_path(path:str, catid_list) -> list[tuple[str,str]]:
    """
    Recursively collect the UCS-named audio files found under `path`.

    Only files whose extension is exactly `.wav` or `.flac` are considered,
    and files whose name starts with `._` are ignored. A file is kept when
    `parse_ucs` returns a truthy result for its extension-less name.

    :param path: root directory handed to `os.walk`
    :param catid_list: category IDs passed through to `parse_ucs`
    :returns: a list of `(cat_id, file_path)` pairs, in walk order
    """
    types = ('.wav', '.flac')
    logger = logging.getLogger('ucsinfer')
    scan_list = []

    # The bar is declared with unit='dir', so it advances once per directory
    # visited. Using it as a context manager closes it when the walk ends,
    # including when the walk raises.
    with tqdm.tqdm(total=None, unit='dir',
                   desc="Walking filesystem...") as walker_p:
        for dirpath, _, filenames in os.walk(path):
            logger.info(f"Walking directory {dirpath}")
            walker_p.update()

            for filename in filenames:
                root, ext = os.path.splitext(filename)
                if ext not in types or filename.startswith("._"):
                    continue 

                if (ucs_components := parse_ucs(root, catid_list)):
                    p = os.path.join(dirpath, filename)
                    logger.info(f"Adding path to scan list {p}")
                    scan_list.append((ucs_components.cat_id, p))

    return scan_list
 def scan_metadata(scan_list: list[tuple[str,str]], catid_list: list[str]) \
        -> Generator[tuple[str,str], None, None]:
    """
    Yield a `(sentence, catid)` pair for each file in `scan_list`.

    The sentence is the value returned by `ffmpeg_description` for the file
    when that value is truthy; otherwise it is the `fx_name` component that
    `parse_ucs` extracts from the file's base name. Files for which
    `ffmpeg_description` raises `CalledProcessError` are logged and skipped.

    :param scan_list: `(catid, file_path)` pairs
    :param catid_list: category IDs passed through to `parse_ucs`
    """
    logger = logging.getLogger('ucsinfer')

    for catid, file_path in tqdm.tqdm(scan_list, unit='files'):
        logger.info(f"Scanning file with ffprobe: {file_path}")
        try:
            desc = ffmpeg_description(file_path)
        except CalledProcessError as e:
            # `stderr` is None unless the call captured it, and is bytes when
            # captured in binary mode, so normalise it before formatting
            # instead of concatenating it onto a str.
            stderr = e.stderr
            if isinstance(stderr, bytes):
                stderr = stderr.decode(errors='replace')
            logger.error(f"ffprobe returned error ({e.returncode}): "
                         f"{stderr}")
            continue

        if desc:
            yield desc, str(catid)
            continue

        # No usable description: fall back to the FX name in the file name.
        # The name parsed here still carries its extension, so a failed parse
        # is handled by skipping the file rather than by an `assert`, which
        # is stripped under `python -O`.
        comps = parse_ucs(os.path.basename(file_path), catid_list)
        if not comps:
            logger.warning(f"Could not parse UCS name, skipping: {file_path}")
            continue

        yield comps.fx_name, str(catid)
 def ucs_definitions_generator(ucs: list[Ucs]) \
        -> Generator[tuple[str,str],None, None]:
    """
    Generate `(sentence, catid)` pairs from UCS category definitions.

    Each category contributes two pairs, in this order: its `explanations`
    text, then its `synonymns` joined with ", ".

    :param ucs: the category definitions to draw sentences from
    """
    for entry in ucs:
        definition = entry.explanations
        yield definition, entry.catid

        synonym_text = ", ".join(entry.synonymns)
        yield synonym_text, entry.catid
 def print_dataset_stats(dataset: DatasetDict, catlist: list[str]):
    """
    Print a table of record counts for `dataset`.

    The first row is the number of records summed over every split; it is
    followed by one row per split.

    :param dataset: mapping of split name to a split supporting `len()`
    :param catlist: the category ID list (currently unused)
    """
    # `len()` of the DatasetDict itself counts its splits, not its records,
    # so the combined total is summed from the individual splits.
    split_sizes = [(name, len(split)) for name, split in dataset.items()]
    total_records = sum(size for _, size in split_sizes)

    # Every row is a flat [label, value] list so tabulate renders two columns.
    data_table = [["Total records in combined dataset:", total_records]]
    data_table += [[f"Total records in `{name}`:", size]
                   for name, size in split_sizes]

    print(tabulate(data_table))
 # https://www.sbert.net/docs/sentence_transformer/loss_overview.html 
 def build_sentence_class_dataset(
-        records: Generator[tuple[str, str], Any, None], catlist: list[str]) -> Dataset:
+        records: Iterator[tuple[str, str]], 
        catlist: list[str]) -> DatasetDict:
    """
    Create a new dataset for `records` which contains (sentence, class) pairs.
    The dataset is split into train and test slices.
    :param records: a generator for records that generates pairs of 
        (sentence, catid)
@@ -16,9 +85,12 @@ def build_sentence_class_dataset(
    labels = ClassLabel(names=catlist)
    features = Features({'sentence': Value('string'),
                                                 'class': labels})
    info = DatasetInfo(
            description=f"(sentence, UCS CatID) pairs gathered by the "
-            "ucsinfer tool on {}")
+            "ucsinfer tool on {}", features= features)
    items: list[dict] = []
@@ -26,9 +98,16 @@ def build_sentence_class_dataset(
        items += [{'sentence': obj[0], 'class': obj[1]}]
-    return Dataset.from_list(items, features=Features({'sentence': Value('string'), 
+    whole = Dataset.from_list(items, features=features, info=info)
-                                                         'class': labels}), 
+    
-                             info=info)
+    split_set = whole.train_test_split(0.2)
    test_eval_set = split_set['test'].train_test_split(0.5)
    return DatasetDict({
        'train': split_set['train'],
        'test': test_eval_set['train'],
        'eval': test_eval_set['test']
        })
 # def build_sentence_anchor_dataset() -> Dataset:
--- a/ucsinfer/inference.py
+++ b/ucsinfer/inference.py
@@ -103,7 +103,12 @@ class InferenceContext:
        else:
            print(f"Calculating embeddings for model {self.model_name}...")
-            for cat_defn in self.catlist:
+            # we need to calculate the embeddings for all cats, not just the 
            # ones we're loading for this run
            full_catlist = load_ucs(full_ucs= True)
            for cat_defn in full_catlist:
                embeddings += [{
                    'CatID': cat_defn.catid,
                    'Embedding': self._encode_category(cat_defn)
@@ -113,7 +118,9 @@ class InferenceContext:
            with open(embedding_cache_path, 'wb') as g:
                pickle.dump(embeddings, g)
-        return embeddings
+        whitelisted_cats = [cat.catid for cat in self.catlist]
        return [e for e in embeddings if e['CatID'] in whitelisted_cats]
    def _encode_category(self, cat: Ucs) -> np.ndarray:
        sentence_components = [cat.explanations,
--- a/ucsinfer/recommend.py
+++ b/ucsinfer/recommend.py
@@ -4,8 +4,10 @@ from re import match
 from .inference import InferenceContext
 from tabulate import tabulate
 def print_recommendation(path: str | None, text: str, ctx: InferenceContext, 
-                         interactive_rename: bool):
+                         interactive_rename: bool, recommend_limit=10):
    """
    Print recommendations interactively.
@@ -17,18 +19,19 @@ def print_recommendation(path: str | None, text: str, ctx: InferenceContext,
              `print_recommendation` should be called again with this argument.
            - if retval[2] is a str, this is the catid the user has selected.
    """
-    recs = ctx.classify_text_ranked(text)
+    recs = ctx.classify_text_ranked(text, limit=recommend_limit)
    print("----------")
    if path:
        print(f"Path: {path}")
    print(f"Text: {text or '<None>'}")
    for i, r in enumerate(recs):
        cat, subcat, _ = ctx.lookup_category(r)
        print(f"- {i}: {r} ({cat}-{subcat})")
    if interactive_rename and path is not None:
-        response = input("#, t [text], ?, q > ")
+        response = input("(n#), t, c, b, ?, q > ")
        if m := match(r'^([0-9]+)', response):
            selection = int(m.group(1))
@@ -43,12 +46,27 @@ def print_recommendation(path: str | None, text: str, ctx: InferenceContext,
            text = m.group(1)
            return True, text, None
        elif m := match(r'^c (.+)', response):
            return True, None, m.group(1)
        elif m := match(r'^b (.+)', response):
            expt = []
            for cat in ctx.catlist:
                if cat.catid.startswith(m.group(1)):
                    expt.append([f"{cat.catid}: ({cat.category}-{cat.subcategory})", 
                              cat.explanations])
            print(tabulate(expt, maxcolwidths=80))
            return True, text, None
        elif response.startswith("?"):
            print("""
 Choices:
 - Enter recommendation number to rename file,
- "t [text]" to search for new recommendations based on [text]
+- "t <text>" to search for new recommendations based on <text>
 - "p" re-use the last selected cat-id
 - "c <cat>" to type in a category by hand 
 - "b <cat>" browse category list for categories starting with <cat>
 - "?" for this message
 - "q" to quit
 - or any other key to skip this file and continue to next file
@@ -56,6 +74,8 @@ Choices:
            return True, text, None
        elif response.startswith('q'):
            return (False, None, None)
        else:
            print()
    else:
        return None
Author	SHA1	Message	Date
Jamie Hardt	615d8ab279	Refactoring of gather	2025-09-10 23:49:33 -07:00
Jamie Hardt	9a887c4ed5	Refactoring of gather	2025-09-10 23:38:33 -07:00
Jamie Hardt	63b140209b	Command-line entry point:wq	2025-09-10 22:23:57 -07:00
Jamie Hardt	d181ac73b1	Made prompt shorter	2025-09-10 22:08:16 -07:00
Jamie Hardt	bddce23c76	Added to prompt help	2025-09-10 22:07:53 -07:00
Jamie Hardt	b4758dd138	Some features for recommend, a browse feature	2025-09-06 14:10:53 -07:00
Jamie Hardt	04332b73ee	Fixed a bug in cat masking	2025-09-04 10:54:35 -07:00
Jamie Hardt	103fffe0a4	Added another function to recommend	2025-09-04 10:43:31 -07:00
Jamie Hardt	10519f9c1a	Split training sets from gather	2025-09-03 19:40:55 -07:00