Work on models.md

2025-08-26 18:07:42 -07:00
parent c1734e3924
commit 8962124795
3 changed files with 27 additions and 36 deletions
--- a/MODELS.md
+++ b/MODELS.md
@@ -0,0 +1,16 @@
+# Models
+
+## Results for Model paraphrase-multilingual-MiniLM-L12-v2 ##
+
+(FOLYProp and FOLYFeet have been omitted from the dataset.)
+
+|                                |    n | pct    |
+|--------------------------------|------|--------|
+| Total records in sample:       | 2335 |        |
+| Top Result:                    |  415 | 17.77% |
+| Top 5 Result:                  |  545 | 23.34% |
+| Top 10 Result:                 |  419 | 17.94% |
+|                                |      |        |
+| UCS category count:            |  752 |        |
+| Total categories in sample:    |  238 | 31.65% |
+| Most missed category (BLLTBy): |  140 | 6.00%  |
--- a/MODELS.rst
+++ b/MODELS.rst
@@ -1,32 +0,0 @@
-Results for Model paraphrase-multilingual-mpnet-base-v2
-=====
-
-================================  ====  ======
-..                                   n  pct
-================================  ====  ======
-Total records in sample:          3445
-Top Result:                        469  13.61%
-Top 5 Result:                      519  15.07%
-Top 10 Result:                     513  14.89%
-================================  ====  ======
-UCS category count:                752
-Total categories in sample:        240  31.91%
-Most missed category (FOLYProp):  1057  30.68%
-================================  ====  ======
-
-
-Results for Model paraphrase-multilingual-MiniLM-L12-v2
-=====
-
-================================  ====  ======
-..                                   n  pct
-================================  ====  ======
-Total records in sample:          3445
-Top Result:                        418  12.13%
-Top 5 Result:                      559  16.23%
-Top 10 Result:                     433  12.57%
-================================  ====  ======
-UCS category count:                752
-Total categories in sample:        240  31.91%
-Most missed category (FOLYProp):  1047  30.39%
-================================  ====  ======
--- a/ucsinfer/main.py
+++ b/ucsinfer/main.py
@@ -68,7 +68,9 @@ def finetune():
@ucsinfer.command('evaluate')
@click.option('--offset', type=int, default=0)
@click.option('--limit', type=int, default=-1)
-@click.option('--no-foley', type=bool, default=False)
+@click.option('--no-foley', 'no_foley', flag_value=True, default=False, 
+              help="Ignore any data in the set with FOLYProp or FOLYFeet "
+              "category")
@click.option('--model', type=str, 
              default="paraphrase-multilingual-mpnet-base-v2")
@click.argument('dataset', type=click.File('r', encoding='utf8'),
@@ -90,6 +92,9 @@ def evaluate(dataset, offset, limit, model, no_foley):
            break

        cat_id, description = row
+        if no_foley and cat_id in ['FOLYProp', 'FOLYFeet']:
+            continue 
+
        guesses = ctx.classify_text_ranked(description, limit=10)
        if cat_id == guesses[0]:
            results.append({'catid': cat_id, 'result': "TOP"})
@@ -116,8 +121,10 @@ def evaluate(dataset, offset, limit, model, no_foley):

    miss_counts = sorted(miss_counts, key=lambda x: x[1])

-    print(f"Results for Model {model}")
-    print("=====\n")
+    print(f"## Results for Model {model} ##")
+
+    if no_foley:
+        print("(FOLYProp and FOLYFeet have been omitted from the dataset.)\n")

    table = [
        ["Total records in sample:", f"{total}"],
@@ -136,7 +143,7 @@ def evaluate(dataset, offset, limit, model, no_foley):
         f"{float(miss_counts[-1][1])/float(total):.2%}"]
    ]

-    print(tabulate(table, headers=['', 'n', 'pct'], tablefmt='rst'))
+    print(tabulate(table, headers=['', 'n', 'pct'], tablefmt='github'))


 if __name__ == '__main__':