Added better error reporting to gather

And a bunch of options to control behavior
2025-09-03 16:07:41 -07:00
parent fee55a7d5a
commit b833d6d3c0
4 changed files with 165 additions and 103 deletions
--- a/ucsinfer/gather.py
+++ b/ucsinfer/gather.py
@@ -0,0 +1,33 @@
+from datasets import Dataset, Features, Value, ClassLabel
+
+from typing import Generator, Any
+
+# https://www.sbert.net/docs/sentence_transformer/loss_overview.html 
+
+def build_sentence_class_dataset(
+        records: Generator[tuple[str, str], Any, None], catlist: list[str]) -> Dataset:
+    """
+    Create a new dataset for `records` which contains (sentence, class) pairs.
+
+    :param records: a generator for records that generates pairs of 
+        (sentence, catid)
+    :returns: A dataset with two columns: (sentence, hash(catid))
+    """
+
+    labels = ClassLabel(names=catlist)
+    
+    items: list[dict] = []
+    for obj in records:
+        items += [{'sentence': obj[0], 'class': obj[1]}]
+        
+
+    return Dataset.from_list(items, features=Features({'sentence': Value('string'), 
+                                                         'class': labels}))
+
+
+# def build_sentence_anchor_dataset() -> Dataset:
+#     """
+#     Create a new dataset for `records` which contains (sentence, anchor) pairs.
+#     """
+#     pass
+