Rewriting structure, added to TODO

2025-09-13 23:03:29 -07:00
parent eb1245a64a
commit c15c499869
2 changed files with 28 additions and 7 deletions
--- a/TODO.md
+++ b/TODO.md
@@ -8,9 +8,19 @@
 - Maybe more dataset configurations


+## Qualify
+
+- Print stats for a dataset
+
 ## Fine-tune

+- https://www.sbert.net/docs/sentence_transformer/loss_overview.html#loss-table 
+  - Use (anchor, positive) pairs to train a new model 
+  - Use (sentence) + class labels to train a new model
 - Implement BatchAllTripletLoss
+- Implement a two-phase training regime 
+  - 1. Train with anchored definitions then...
+  - 2. Train with class labels


 ## Evaluate
--- a/ucsinfer/main.py
+++ b/ucsinfer/main.py
@@ -139,18 +139,18 @@ def recommend(ctx, text, paths, interactive, skip_ucs):

@ucsinfer.command('gather')
@click.option('--out', default='dataset/', show_default=True)
-@click.option('--ucs-data', flag_value=True, help="Create a dataset based "
-              "on the UCS category explanations and synonymns (PATHS will "
-              "be ignored.)")
+# @click.option('--ucs-data', flag_value=True, help="Create a dataset based "
+#               "on the UCS category explanations and synonymns (PATHS will "
+#               "be ignored.)")
@click.argument('paths', nargs=-1)
@click.pass_context
 def gather(ctx, paths, out, ucs_data):
    """
    Scan files to build a training dataset
    
-    The `gather` is used to build a training dataset for finetuning the 
-    selected model. Description sentences and UCS categories are collected from 
-    '.wav' and '.flac' files on-disk that have valid UCS filenames and assigned 
+    `gather` is used to build a training dataset for finetuning the selected
+    model. Description sentences and UCS categories are collected from '.wav'
+    and '.flac' files on-disk that have valid UCS filenames and assigned
    CatIDs, and this information is recorded into a HuggingFace dataset.
    
    Gather scans the filesystem in two passes: first, the directory tree is 
@@ -186,7 +186,18 @@ def gather(ctx, paths, out, ucs_data):
    logger.info(f"Saving dataset to disk at {out}")
    print_dataset_stats(dataset, catid_list)
    dataset.save_to_disk(out)
-    
+
+@ucsinfer.command('qualify')
+def qualify():
+    """
+    Check and prepare a dataset for finetuning
+
+    `qualify` reads a dataset and will output statistics on its coverage of the 
+    UCS, and will add the UCS canonical definitions to the dataset for every 
+    extant category.
+    """
+    pass
+

@ucsinfer.command('finetune')
@click.pass_context