Added better error reporting to gather
And a bunch of options to control behavior
This commit is contained in:
33
ucsinfer/gather.py
Normal file
33
ucsinfer/gather.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from datasets import Dataset, Features, Value, ClassLabel
|
||||
|
||||
from typing import Generator, Any
|
||||
|
||||
# https://www.sbert.net/docs/sentence_transformer/loss_overview.html
|
||||
|
||||
def build_sentence_class_dataset(
        records: Generator[tuple[str, str], Any, None], catlist: list[str]) -> Dataset:
    """
    Create a new dataset for `records` which contains (sentence, class) pairs.

    :param records: a generator for records that generates pairs of
        (sentence, catid)
    :param catlist: the category names used to define the `ClassLabel`
        feature of the `class` column
    :returns: A dataset with two columns: `sentence` (a string) and `class`
        (a `ClassLabel` whose names are `catlist`)
    """
    labels = ClassLabel(names=catlist)

    # Drain the generator into one row dict per record. Elements are read by
    # index rather than unpacked so records carrying extra trailing fields
    # are still accepted.
    items: list[dict] = [
        {'sentence': record[0], 'class': record[1]} for record in records
    ]

    return Dataset.from_list(items, features=Features({'sentence': Value('string'),
                                                       'class': labels}))
|
||||
|
||||
|
||||
# def build_sentence_anchor_dataset() -> Dataset:
|
||||
# """
|
||||
# Create a new dataset for `records` which contains (sentence, anchor) pairs.
|
||||
# """
|
||||
# pass
|
||||
|
Reference in New Issue
Block a user