Rewriting structure, added to TODO

2025-09-13 23:03:29 -07:00
parent eb1245a64a
commit c15c499869
2 changed files with 28 additions and 7 deletions

TODO.md (10 lines changed)

@@ -8,9 +8,19 @@
 - Maybe more dataset configurations
 
+## Qualify
+- Print stats for a dataset
+
 ## Fine-tune
+- https://www.sbert.net/docs/sentence_transformer/loss_overview.html#loss-table
+- Use (anchor, positive) pairs to train a new model
+- Use (sentence) + class labels to train a new model
 - Implement BatchAllTripletLoss
+- Implement a two-phase training regime
+- 1. Train with anchored definitions then...
+- 2. Train with class labels
+
 ## Evaluate
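The loss table linked above maps dataset shapes to compatible losses. A minimal sketch of the two-phase regime listed here, assuming sentence-transformers' `SentenceTransformerTrainer` and illustrative model/column names (not ucsinfer's actual code):

```python
# Sketch only: model name, columns and data are assumptions for illustration.
from datasets import Dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers.losses import MultipleNegativesRankingLoss, BatchAllTripletLoss

model = SentenceTransformer("all-MiniLM-L6-v2")

# Phase 1: (anchor, positive) pairs, e.g. a file description anchored to its
# UCS category definition.
pairs = Dataset.from_dict({
    "anchor": ["rain falling steadily on a tin roof",
               "distant explosion with debris"],
    "positive": ["RAIN: precipitation hitting surfaces",
                 "EXPLOSION: detonations and blasts"],
})
SentenceTransformerTrainer(
    model=model,
    train_dataset=pairs,
    loss=MultipleNegativesRankingLoss(model),
).train()

# Phase 2: single sentences with integer class labels (one per CatID).
labelled = Dataset.from_dict({
    "sentence": ["rain falling steadily on a tin roof",
                 "distant explosion with debris"],
    "label": [0, 1],
})
SentenceTransformerTrainer(
    model=model,
    train_dataset=labelled,
    loss=BatchAllTripletLoss(model),
).train()

model.save("finetuned-model/")
```

Note that BatchAllTripletLoss only yields usable triplets when each batch contains several examples per label, so in practice phase 2 would likely also need a label-grouping batch sampler (e.g. `BatchSamplers.GROUP_BY_LABEL` via `SentenceTransformerTrainingArguments`).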


@@ -139,18 +139,18 @@ def recommend(ctx, text, paths, interactive, skip_ucs):
 @ucsinfer.command('gather')
 @click.option('--out', default='dataset/', show_default=True)
-@click.option('--ucs-data', flag_value=True, help="Create a dataset based "
-              "on the UCS category explanations and synonymns (PATHS will "
-              "be ignored.)")
+# @click.option('--ucs-data', flag_value=True, help="Create a dataset based "
+#               "on the UCS category explanations and synonymns (PATHS will "
+#               "be ignored.)")
 @click.argument('paths', nargs=-1)
 @click.pass_context
 def gather(ctx, paths, out, ucs_data):
     """
     Scan files to build a training dataset
 
-    The `gather` is used to build a training dataset for finetuning the
-    selected model. Description sentences and UCS categories are collected from
-    '.wav' and '.flac' files on-disk that have valid UCS filenames and assigned
+    `gather` is used to build a training dataset for finetuning the selected
+    model. Description sentences and UCS categories are collected from '.wav'
+    and '.flac' files on-disk that have valid UCS filenames and assigned
     CatIDs, and this information is recorded into a HuggingFace dataset.
 
     Gather scans the filesystem in two passes: first, the directory tree is
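For context, a rough sketch of the kind of extraction `gather` performs, assuming the standard UCS filename layout (CatID_FXName_CreatorID_SourceID) and illustrative paths and column names; the actual two-pass scanner in ucsinfer is not shown here:

```python
from pathlib import Path
from datasets import Dataset  # HuggingFace datasets

def parse_ucs_filename(path: Path) -> tuple[str, str] | None:
    """Return (catid, description) for a UCS-named file, else None.

    Assumes the CatID_FXName_CreatorID_SourceID layout; ucsinfer's real
    parser and validation may differ.
    """
    parts = path.stem.split("_")
    if len(parts) < 4:
        return None
    return parts[0], parts[1]

records = {"sentence": [], "catid": []}  # column names are assumptions
for p in Path("sfx/").rglob("*"):        # "sfx/" is a hypothetical PATHS entry
    if p.suffix.lower() not in (".wav", ".flac"):
        continue
    parsed = parse_ucs_filename(p)
    if parsed:
        records["catid"].append(parsed[0])
        records["sentence"].append(parsed[1])

dataset = Dataset.from_dict(records)
dataset.save_to_disk("dataset/")
```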
@@ -186,7 +186,18 @@ def gather(ctx, paths, out, ucs_data):
     logger.info(f"Saving dataset to disk at {out}")
     print_dataset_stats(dataset, catid_list)
     dataset.save_to_disk(out)
 
+
+@ucsinfer.command('qualify')
+def qualify():
+    """
+    Check and prepare a dataset for finetuning
+
+    `qualify` reads a dataset and will output statistics on its coverage of
+    the UCS, and will add the UCS canonical definitions to the dataset for
+    every extant category.
+    """
+    pass
+
+
 @ucsinfer.command('finetune')
 @click.pass_context
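A minimal sketch of what the `qualify` stub might grow into, assuming a dataset with `sentence`/`catid` columns and a hypothetical CatID-to-definition mapping (in practice the definitions ship with the UCS category list):

```python
from collections import Counter
from datasets import Dataset, concatenate_datasets, load_from_disk

# Hypothetical CatID -> canonical UCS definition mapping, for illustration.
UCS_DEFINITIONS = {
    "RAIN": "Rain falling and dripping on various surfaces.",
    "EXPLOSION": "Detonations, blasts and debris.",
}

ds = load_from_disk("dataset/")
counts = Counter(ds["catid"])

# Coverage statistics: how many known categories have at least one example.
covered = [c for c in UCS_DEFINITIONS if counts[c] > 0]
print(f"Coverage: {len(covered)}/{len(UCS_DEFINITIONS)} categories")
for catid, n in counts.most_common():
    print(f"{catid:>12}: {n}")

# Append the canonical definition as one more example per extant category.
extra = Dataset.from_dict({
    "sentence": [UCS_DEFINITIONS[c] for c in covered],
    "catid": list(covered),
})
concatenate_datasets([ds, extra]).save_to_disk("dataset-qualified/")
```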