Rewriting structure, added to TODO

2025-09-13 23:03:29 -07:00
parent eb1245a64a
commit c15c499869
2 changed files with 28 additions and 7 deletions

TODO.md (10 lines changed)

@@ -8,9 +8,19 @@
 - Maybe more dataset configurations
 
+## Qualify
+- Print stats for a dataset
+
 ## Fine-tune
+- https://www.sbert.net/docs/sentence_transformer/loss_overview.html#loss-table
+- Use (anchor, positive) pairs to train a new model
+- Use (sentence) + class labels to train a new model
 - Implement BatchAllTripletLoss
+- Implement a two-phase training regime
+- 1. Train with anchored definitions then...
+- 2. Train with class labels
+
 ## Evaluate
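The loss table linked above maps dataset shapes to compatible losses. A minimal sketch of the two-phase regime listed here, assuming sentence-transformers' `SentenceTransformerTrainer` and illustrative model/column names (not ucsinfer's actual code):

```python
# Sketch only: model name, columns and data are assumptions for illustration.
from datasets import Dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers.losses import MultipleNegativesRankingLoss, BatchAllTripletLoss

model = SentenceTransformer("all-MiniLM-L6-v2")

# Phase 1: (anchor, positive) pairs, e.g. a file description anchored to its
# UCS category definition.
pairs = Dataset.from_dict({
    "anchor": ["rain falling steadily on a tin roof",
               "distant explosion with debris"],
    "positive": ["RAIN: precipitation hitting surfaces",
                 "EXPLOSION: detonations and blasts"],
})
SentenceTransformerTrainer(
    model=model,
    train_dataset=pairs,
    loss=MultipleNegativesRankingLoss(model),
).train()

# Phase 2: single sentences with integer class labels (one per CatID).
labelled = Dataset.from_dict({
    "sentence": ["rain falling steadily on a tin roof",
                 "distant explosion with debris"],
    "label": [0, 1],
})
SentenceTransformerTrainer(
    model=model,
    train_dataset=labelled,
    loss=BatchAllTripletLoss(model),
).train()

model.save("finetuned-model/")
```

Note that BatchAllTripletLoss only yields usable triplets when each batch contains several examples per label, so in practice phase 2 would likely also need a label-grouping batch sampler (e.g. `BatchSamplers.GROUP_BY_LABEL` via `SentenceTransformerTrainingArguments`).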


@@ -139,18 +139,18 @@ def recommend(ctx, text, paths, interactive, skip_ucs):
 @ucsinfer.command('gather')
 @click.option('--out', default='dataset/', show_default=True)
-@click.option('--ucs-data', flag_value=True, help="Create a dataset based "
-              "on the UCS category explanations and synonymns (PATHS will "
-              "be ignored.)")
+# @click.option('--ucs-data', flag_value=True, help="Create a dataset based "
+#               "on the UCS category explanations and synonymns (PATHS will "
+#               "be ignored.)")
 @click.argument('paths', nargs=-1)
 @click.pass_context
 def gather(ctx, paths, out, ucs_data):
     """
     Scan files to build a training dataset
 
-    The `gather` is used to build a training dataset for finetuning the
-    selected model. Description sentences and UCS categories are collected from
-    '.wav' and '.flac' files on-disk that have valid UCS filenames and assigned
+    `gather` is used to build a training dataset for finetuning the selected
+    model. Description sentences and UCS categories are collected from '.wav'
+    and '.flac' files on-disk that have valid UCS filenames and assigned
     CatIDs, and this information is recorded into a HuggingFace dataset.
 
     Gather scans the filesystem in two passes: first, the directory tree is
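For context, a rough sketch of the kind of extraction `gather` performs, assuming the standard UCS filename layout (CatID_FXName_CreatorID_SourceID) and illustrative paths and column names; the actual two-pass scanner in ucsinfer is not shown here:

```python
from pathlib import Path
from datasets import Dataset  # HuggingFace datasets

def parse_ucs_filename(path: Path) -> tuple[str, str] | None:
    """Return (catid, description) for a UCS-named file, else None.

    Assumes the CatID_FXName_CreatorID_SourceID layout; ucsinfer's real
    parser and validation may differ.
    """
    parts = path.stem.split("_")
    if len(parts) < 4:
        return None
    return parts[0], parts[1]

records = {"sentence": [], "catid": []}  # column names are assumptions
for p in Path("sfx/").rglob("*"):        # "sfx/" is a hypothetical PATHS entry
    if p.suffix.lower() not in (".wav", ".flac"):
        continue
    parsed = parse_ucs_filename(p)
    if parsed:
        records["catid"].append(parsed[0])
        records["sentence"].append(parsed[1])

dataset = Dataset.from_dict(records)
dataset.save_to_disk("dataset/")
```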
@@ -186,7 +186,18 @@ def gather(ctx, paths, out, ucs_data):
     logger.info(f"Saving dataset to disk at {out}")
     print_dataset_stats(dataset, catid_list)
     dataset.save_to_disk(out)
 
+
+@ucsinfer.command('qualify')
+def qualify():
+    """
+    Check and prepare a dataset for finetuning
+
+    `qualify` reads a dataset and will output statistics on its coverage of
+    the UCS, and will add the UCS canonical definitions to the dataset for
+    every extant category.
+    """
+    pass
+
+
 @ucsinfer.command('finetune')
 @click.pass_context
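A minimal sketch of what the `qualify` stub might grow into, assuming a dataset with `sentence`/`catid` columns and a hypothetical CatID-to-definition mapping (in practice the definitions ship with the UCS category list):

```python
from collections import Counter
from datasets import Dataset, concatenate_datasets, load_from_disk

# Hypothetical CatID -> canonical UCS definition mapping, for illustration.
UCS_DEFINITIONS = {
    "RAIN": "Rain falling and dripping on various surfaces.",
    "EXPLOSION": "Detonations, blasts and debris.",
}

ds = load_from_disk("dataset/")
counts = Counter(ds["catid"])

# Coverage statistics: how many known categories have at least one example.
covered = [c for c in UCS_DEFINITIONS if counts[c] > 0]
print(f"Coverage: {len(covered)}/{len(UCS_DEFINITIONS)} categories")
for catid, n in counts.most_common():
    print(f"{catid:>12}: {n}")

# Append the canonical definition as one more example per extant category.
extra = Dataset.from_dict({
    "sentence": [UCS_DEFINITIONS[c] for c in covered],
    "catid": list(covered),
})
concatenate_datasets([ds, extra]).save_to_disk("dataset-qualified/")
```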