diff --git a/ucsinfer/gather.py b/ucsinfer/gather.py index e355ecb..1700f52 100644 --- a/ucsinfer/gather.py +++ b/ucsinfer/gather.py @@ -1,4 +1,4 @@ -from datasets import Dataset, Features, Value, ClassLabel +from datasets import Dataset, Features, Value, ClassLabel, DatasetInfo from typing import Generator, Any @@ -16,13 +16,19 @@ def build_sentence_class_dataset( labels = ClassLabel(names=catlist) + info = DatasetInfo( + description=f"(sentence, UCS CatID) pairs gathered by the " + "ucsinfer tool on {}") + + items: list[dict] = [] for obj in records: items += [{'sentence': obj[0], 'class': obj[1]}] return Dataset.from_list(items, features=Features({'sentence': Value('string'), - 'class': labels})) + 'class': labels}), + info=info) # def build_sentence_anchor_dataset() -> Dataset: