Plumbing for CSV import from online logs

TODO
2025-09-26 21:08:41 -07:00 · 2025-09-26 20:52:08 -07:00
2 changed files with 23 additions and 7 deletions
--- a/TODO.md
+++ b/TODO.md
@@ -18,9 +18,8 @@
  - Use (anchor, positive) pairs to train a new model 
  - Use (sentence) + class labels to train a new model
 - Implement BatchAllTripletLoss
- Implement a two-phase training regime 
-  1. Train with anchored definitions then...
-  2. Train with class labels
+- Train with anchored definitions and/or...
+- Train with class labels


 ## Evaluate
--- a/ucsinfer/main.py
+++ b/ucsinfer/main.py
@@ -1,11 +1,8 @@
 import os
-# import csv
 import logging
 from itertools import chain

-import tqdm
 import click
-# from tabulate import tabulate, SEPARATING_LINE

 from .inference import InferenceContext, load_ucs
 from .gather import (build_sentence_class_dataset, print_dataset_stats, 
@@ -136,6 +133,26 @@ def recommend(ctx, text, paths, interactive, skip_ucs):
                os.rename(path, new_path)
                break
        
+@ucsinfer.command('csv')
+@click.option('--filename-col', default="FileName", 
+              help="Heading or index of the column containing filenames",
+              show_default=True)
+@click.option('--description-col', default="TrackDescription", 
+              help="Heading or index of the column containing descriptions",
+              show_default=True)
+@click.option('--out', default='dataset/', show_default=True)
+@click.argument('paths', nargs=-1)
+@click.pass_context
+def csv(ctx, paths, out, filename_col, description_col):
+    """
+    Scan training data from CSV files 
+
+    `csv` is used to build a training dataset for finetuning the selected 
+    model, much like the `gather` command, except instead of scanning the 
+    file system it builds a dataset from descriptions and UCS filenames in 
+    columns of a CSV file.
+    """
+    pass

@ucsinfer.command('gather')
@click.option('--out', default='dataset/', show_default=True)
@@ -146,7 +163,7 @@ def recommend(ctx, text, paths, interactive, skip_ucs):
@click.pass_context
 def gather(ctx, paths, out, ucs_data):
    """
-    Scan files to build a training dataset
+    Scan training data from audio files
    
    `gather` is used to build a training dataset for finetuning the selected
    model. Description sentences and UCS categories are collected from '.wav'
Author	SHA1	Message	Date
Jamie Hardt	e5698fec7b	Plumbing for CSV import from online logs	2025-09-26 21:08:41 -07:00
Jamie Hardt	c75365b856	TODO	2025-09-26 20:52:08 -07:00