Compare commits

...

3 Commits

Author SHA1 Message Date
fee55a7d5a Twiddles 2025-09-03 14:56:41 -07:00
0594899bdd dataset writing in HF format 2025-09-03 14:52:10 -07:00
46a693bf93 tweaks 2025-09-03 14:23:09 -07:00
2 changed files with 46 additions and 29 deletions

10
TODO.md
View File

@@ -2,17 +2,15 @@
- Use History when adding catids
## Gather
- Maybe more dataset configurations
## Validate
A function for validating a dataset for finetuning
## Fine-tune
- Implement
- Implement BatchAllTripletLoss
## Evaluate
@@ -22,6 +20,8 @@ A function for validating a dataset for finetuning
- Print raw output
- Maybe load everything into a sqlite for slicker reporting
## Utility
- Dataset partitioning
- Clear caches

View File

@@ -1,9 +1,7 @@
import os
import sys
import csv
import logging
from typing import Generator
import tqdm
import click
@@ -26,10 +24,14 @@ logger.addHandler(stream_handler)
@click.group(epilog="For more information see "
"<https://git.squad51.us/jamie/ucsinfer>")
@click.option('--verbose', '-v', flag_value=True, help='Verbose output')
@click.option('--model', type=str, metavar="<model-name>",
default="paraphrase-multilingual-mpnet-base-v2",
show_default=True,
help="Select the sentence_transformer model to use")
@click.option('--no-model-cache', flag_value=True,
help="Don't use local model cache")
@click.pass_context
def ucsinfer(ctx, verbose, no_model_cache):
def ucsinfer(ctx, verbose, no_model_cache, model):
"""
Tools for applying UCS categories to sounds using large-language Models
"""
@@ -47,6 +49,12 @@ def ucsinfer(ctx, verbose, no_model_cache):
ctx.ensure_object(dict)
ctx.obj['model_cache'] = not no_model_cache
ctx.obj['model_name'] = model
if no_model_cache:
logger.info("Model cache inhibited by config")
logger.info(f"Using model {model}")
@ucsinfer.command('recommend')
@@ -54,10 +62,6 @@ def ucsinfer(ctx, verbose, no_model_cache):
help="Recommend a category for given text instead of reading "
"from a file")
@click.argument('paths', nargs=-1, metavar='<paths>')
@click.option('--model', type=str, metavar="<model-name>",
default="paraphrase-multilingual-mpnet-base-v2",
show_default=True,
help="Select the sentence_transformer model to use")
@click.option('--interactive','-i', flag_value=True, default=False,
help="After processing each path in <paths>, prompt for a "
"recommendation to accept, and then prepend the selection to "
@@ -66,7 +70,7 @@ def ucsinfer(ctx, verbose, no_model_cache):
help="Skip files that already have a UCS category in their "
"name.")
@click.pass_context
def recommend(ctx, text, paths, model, interactive, skip_ucs):
def recommend(ctx, text, paths, interactive, skip_ucs):
"""
Infer a UCS category for a text description
@@ -77,7 +81,7 @@ def recommend(ctx, text, paths, model, interactive, skip_ucs):
of ranked subcategories is printed to the terminal for each PATH.
"""
logger.debug("RECOMMEND mode")
inference_ctx = InferenceContext(model,
inference_ctx = InferenceContext(ctx.obj['model_name'],
use_cached_model=ctx.obj['model_cache'])
if text is not None:
@@ -113,12 +117,14 @@ def recommend(ctx, text, paths, model, interactive, skip_ucs):
@ucsinfer.command('gather')
@click.option('--outfile', type=click.File(mode='w', encoding='utf8'),
default='dataset.csv', show_default=True)
@click.option('--out', default='dataset/', show_default=True)
@click.option('--ucs-data', flag_value=True, help="Create a dataset based "
"on the UCS category explanations and synonymns (PATHS will "
"be ignored.)")
@click.argument('paths', nargs=-1)
def gather(paths, outfile):
def gather(paths, out, ucs_data):
"""
Scan files to build a training dataset at PATH
Scan files to build a training dataset
The `gather` command is used to build a training dataset for finetuning the
selected model. Description sentences and UCS categories are collected from
@@ -137,9 +143,14 @@ def gather(paths, outfile):
logger.debug(f"Loading category list...")
ucs = load_ucs()
catid_list = [cat.catid for cat in ucs]
scan_list = []
catid_list = [cat.catid for cat in ucs]
if ucs_data:
logger.info('Creating dataset for UCS categories instead of from PATH')
paths = []
for path in paths:
logger.info(f"Scanning directory {path}...")
for dirpath, _, filenames in os.walk(path):
@@ -162,12 +173,23 @@ def gather(paths, outfile):
assert comps
yield comps.fx_name, str(pair[0])
dataset = build_sentence_class_dataset(scan_metadata())
dataset.save_to_disk(outfile)
def ucs_metadata():
for cat in ucs:
yield cat.explanations, cat.catid
yield ", ".join(cat.synonymns), cat.catid
if ucs_data:
dataset = build_sentence_class_dataset(ucs_metadata(), catid_list)
else:
dataset = build_sentence_class_dataset(scan_metadata(), catid_list)
logger.info(f"Saving dataset to disk at {out}")
dataset.save_to_disk(out)
@ucsinfer.command('finetune')
def finetune():
@click.pass_context
def finetune(ctx):
"""
Fine-tune a model with training data
"""
@@ -183,14 +205,10 @@ def finetune():
@click.option('--no-foley', 'no_foley', flag_value=True, default=False,
help="Ignore any data in the set with FOLYProp or FOLYFeet "
"category")
@click.option('--model', type=str, metavar="<model-name>",
default="paraphrase-multilingual-mpnet-base-v2",
show_default=True,
help="Select the sentence_transformer model to use")
@click.argument('dataset', type=click.File('r', encoding='utf8'),
default='dataset.csv')
@click.pass_context
def evaluate(ctx, dataset, offset, limit, model, no_foley):
def evaluate(ctx, dataset, offset, limit, no_foley):
"""
Use datasets to evaluate model performance
@@ -211,12 +229,11 @@ def evaluate(ctx, dataset, offset, limit, model, no_foley):
foley, and so these categories can be excluded with the --no-foley option.
"""
logger.debug("EVALUATE mode")
inference_context = InferenceContext(model,
inference_context = InferenceContext(ctx.obj['model_name'],
use_cached_model=
ctx.obj['model_cache'])
reader = csv.reader(dataset)
logger.info(f"Evaluating model {model}...")
results = []
if offset > 0: