From e419f698c94fa8786b5ff22fb0766ff3d069e908 Mon Sep 17 00:00:00 2001
From: Jamie Hardt
Date: Wed, 3 Sep 2025 16:31:05 -0700
Subject: [PATCH] Dataset metadata

---
Review note (ignored by git am): the DatasetInfo description originally put
the f-prefix on the string fragment that has no placeholder, while the
fragment containing "{}" was a plain string, so the stored description ended
with the literal text "on {}". The dangling placeholder and the no-op
f-prefix are removed below; if a gathering date (or similar) was intended,
interpolate it explicitly in the second fragment instead.
Leading whitespace and blank context lines were reconstructed from a
whitespace-collapsed copy of this patch; verify them against the target file
before applying. The post-image blob id on the index line predates this fix.

 ucsinfer/gather.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/ucsinfer/gather.py b/ucsinfer/gather.py
index e355ecb..1700f52 100644
--- a/ucsinfer/gather.py
+++ b/ucsinfer/gather.py
@@ -1,4 +1,4 @@
-from datasets import Dataset, Features, Value, ClassLabel
+from datasets import Dataset, Features, Value, ClassLabel, DatasetInfo
 
 from typing import Generator, Any
 
@@ -16,13 +16,19 @@ def build_sentence_class_dataset(
 
     labels = ClassLabel(names=catlist)
 
+    info = DatasetInfo(
+        description="(sentence, UCS CatID) pairs gathered by the "
+        "ucsinfer tool")
+
+
     items: list[dict] = []
     for obj in records:
         items += [{'sentence': obj[0], 'class': obj[1]}]
 
 
     return Dataset.from_list(items, features=Features({'sentence': Value('string'),
-                                                       'class': labels}))
+                                                       'class': labels}),
+                             info=info)
 
 
 # def build_sentence_anchor_dataset() -> Dataset: