Rewriting structure, added to TODO
This commit is contained in:
10
TODO.md
10
TODO.md
@@ -8,9 +8,19 @@
|
|||||||
- Maybe more dataset configurations
|
- Maybe more dataset configurations
|
||||||
|
|
||||||
|
|
||||||
|
## Qualify
|
||||||
|
|
||||||
|
- Print stats for a dataset
|
||||||
|
|
||||||
## Fine-tune
|
## Fine-tune
|
||||||
|
|
||||||
|
- https://www.sbert.net/docs/sentence_transformer/loss_overview.html#loss-table
|
||||||
|
- Use (anchor, positive) pairs to train a new model
|
||||||
|
- Use (sentence) + class labels to train a new model
|
||||||
- Implement BatchAllTripletLoss
|
- Implement BatchAllTripletLoss
|
||||||
|
- Implement a two-phase training regime
|
||||||
|
- 1. Train with anchored definitions then...
|
||||||
|
- 2. Train with class labels
|
||||||
|
|
||||||
|
|
||||||
## Evaluate
|
## Evaluate
|
||||||
|
|||||||
@@ -139,18 +139,18 @@ def recommend(ctx, text, paths, interactive, skip_ucs):
|
|||||||
|
|
||||||
@ucsinfer.command('gather')
|
@ucsinfer.command('gather')
|
||||||
@click.option('--out', default='dataset/', show_default=True)
|
@click.option('--out', default='dataset/', show_default=True)
|
||||||
@click.option('--ucs-data', flag_value=True, help="Create a dataset based "
|
# @click.option('--ucs-data', flag_value=True, help="Create a dataset based "
|
||||||
"on the UCS category explanations and synonymns (PATHS will "
|
# "on the UCS category explanations and synonymns (PATHS will "
|
||||||
"be ignored.)")
|
# "be ignored.)")
|
||||||
@click.argument('paths', nargs=-1)
|
@click.argument('paths', nargs=-1)
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def gather(ctx, paths, out, ucs_data):
|
def gather(ctx, paths, out, ucs_data):
|
||||||
"""
|
"""
|
||||||
Scan files to build a training dataset
|
Scan files to build a training dataset
|
||||||
|
|
||||||
The `gather` is used to build a training dataset for finetuning the
|
`gather` is used to build a training dataset for finetuning the selected
|
||||||
selected model. Description sentences and UCS categories are collected from
|
model. Description sentences and UCS categories are collected from '.wav'
|
||||||
'.wav' and '.flac' files on-disk that have valid UCS filenames and assigned
|
and '.flac' files on-disk that have valid UCS filenames and assigned
|
||||||
CatIDs, and this information is recorded into a HuggingFace dataset.
|
CatIDs, and this information is recorded into a HuggingFace dataset.
|
||||||
|
|
||||||
Gather scans the filesystem in two passes: first, the directory tree is
|
Gather scans the filesystem in two passes: first, the directory tree is
|
||||||
@@ -187,6 +187,17 @@ def gather(ctx, paths, out, ucs_data):
|
|||||||
print_dataset_stats(dataset, catid_list)
|
print_dataset_stats(dataset, catid_list)
|
||||||
dataset.save_to_disk(out)
|
dataset.save_to_disk(out)
|
||||||
|
|
||||||
|
@ucsinfer.command('qualify')
|
||||||
|
def qualify():
|
||||||
|
"""
|
||||||
|
Check and prepare a dataset for finetuning
|
||||||
|
|
||||||
|
`qualify` reads a dataset and will output statistics on its coverage of the
|
||||||
|
UCS, and will add the UCS canonical definitions to the dataset for every
|
||||||
|
extant category.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
@ucsinfer.command('finetune')
|
@ucsinfer.command('finetune')
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
|
|||||||
Reference in New Issue
Block a user