Rewriting structure, added to TODO
This commit is contained in:
10
TODO.md
10
TODO.md
@@ -8,9 +8,19 @@
|
||||
- Maybe more dataset configurations
|
||||
|
||||
|
||||
## Qualify
|
||||
|
||||
- Print stats for a dataset
|
||||
|
||||
## Fine-tune
|
||||
|
||||
- https://www.sbert.net/docs/sentence_transformer/loss_overview.html#loss-table
|
||||
- Use (anchor, positive) pairs to train a new model
|
||||
- Use (sentence) + class labels to train a new model
|
||||
- Implement BatchAllTripletLoss
|
||||
- Implement a two-phase training regime
|
||||
- 1. Train with anchored definitions then...
|
||||
- 2. Train with class labels
|
||||
|
||||
|
||||
## Evaluate
|
||||
|
||||
@@ -139,18 +139,18 @@ def recommend(ctx, text, paths, interactive, skip_ucs):
|
||||
|
||||
@ucsinfer.command('gather')
|
||||
@click.option('--out', default='dataset/', show_default=True)
|
||||
@click.option('--ucs-data', flag_value=True, help="Create a dataset based "
|
||||
"on the UCS category explanations and synonymns (PATHS will "
|
||||
"be ignored.)")
|
||||
# @click.option('--ucs-data', flag_value=True, help="Create a dataset based "
|
||||
# "on the UCS category explanations and synonymns (PATHS will "
|
||||
# "be ignored.)")
|
||||
@click.argument('paths', nargs=-1)
|
||||
@click.pass_context
|
||||
def gather(ctx, paths, out, ucs_data):
|
||||
"""
|
||||
Scan files to build a training dataset
|
||||
|
||||
The `gather` is used to build a training dataset for finetuning the
|
||||
selected model. Description sentences and UCS categories are collected from
|
||||
'.wav' and '.flac' files on-disk that have valid UCS filenames and assigned
|
||||
`gather` is used to build a training dataset for finetuning the selected
|
||||
model. Description sentences and UCS categories are collected from '.wav'
|
||||
and '.flac' files on-disk that have valid UCS filenames and assigned
|
||||
CatIDs, and this information is recorded into a HuggingFace dataset.
|
||||
|
||||
Gather scans the filesystem in two passes: first, the directory tree is
|
||||
@@ -186,7 +186,18 @@ def gather(ctx, paths, out, ucs_data):
|
||||
logger.info(f"Saving dataset to disk at {out}")
|
||||
print_dataset_stats(dataset, catid_list)
|
||||
dataset.save_to_disk(out)
|
||||
|
||||
|
||||
@ucsinfer.command('qualify')
|
||||
def qualify():
|
||||
"""
|
||||
Check and prepare a dataset for finetuning
|
||||
|
||||
`qualify` reads a dataset and will output statistics on its coverage of the
|
||||
UCS, and will add the UCS canonical definitions to the dataset for every
|
||||
extant category.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@ucsinfer.command('finetune')
|
||||
@click.pass_context
|
||||
|
||||
Reference in New Issue
Block a user