Added better error reporting to gather

And a bunch of options to control behavior
This commit is contained in:
2025-09-03 16:07:41 -07:00
parent fee55a7d5a
commit b833d6d3c0
4 changed files with 165 additions and 103 deletions

33
ucsinfer/gather.py Normal file
View File

@@ -0,0 +1,33 @@
from datasets import Dataset, Features, Value, ClassLabel
from typing import Generator, Any
# https://www.sbert.net/docs/sentence_transformer/loss_overview.html
def build_sentence_class_dataset(
        records: Generator[tuple[str, str], Any, None],
        catlist: list[str]) -> Dataset:
    """
    Create a new dataset for `records` which contains (sentence, class) pairs.

    :param records: a generator for records that generates pairs of
        (sentence, catid); only the first two elements of each record
        are read
    :param catlist: the class names handed to `ClassLabel`; this list
        defines the label set of the 'class' column
    :returns: A dataset with two columns: 'sentence' (a string) and 'class'
        (a `ClassLabel` feature built from `catlist`)

    NOTE(review): the 'class' value is `catid` encoded by the `ClassLabel`
    feature, not `hash(catid)`. Per the `datasets` documentation that
    encoding is the position of the name in `catlist`, so each `catid` is
    presumably expected to be one of the entries of `catlist` -- confirm
    against the caller.
    """
    labels = ClassLabel(names=catlist)

    # Drain the generator into the list-of-dicts shape that
    # `Dataset.from_list` consumes.
    items: list[dict] = [
        {'sentence': record[0], 'class': record[1]} for record in records
    ]

    features = Features({'sentence': Value('string'), 'class': labels})
    return Dataset.from_list(items, features=features)
# def build_sentence_anchor_dataset() -> Dataset:
# """
# Create a new dataset for `records` which contains (sentence, anchor) pairs.
# """
# pass