Autopep

2025-08-26 17:14:56 -07:00
parent 3d67623d77
commit 5ea64d089f
3 changed files with 57 additions and 54 deletions
@@ -22,7 +22,7 @@ def recommend():
    """
    Infer a UCS category for a text description
    """
-    pass 
+    pass


@ucsinfer.command('gather')
@@ -36,7 +36,7 @@ def gather(paths, outfile):
    types = ['.wav', '.flac']
    table = csv.writer(outfile)
    print(f"Loading category list...")
-    catid_list = [cat.catid for cat in load_ucs()] 
+    catid_list = [cat.catid for cat in load_ucs()]

    scan_list = []
    for path in paths:
@@ -47,12 +47,12 @@ def gather(paths, outfile):
                if ext in types and \
                        (ucs_components := parse_ucs(root, catid_list)) and \
                        not filename.startswith("._"):
-                    scan_list.append((ucs_components.cat_id, 
+                    scan_list.append((ucs_components.cat_id,
                                      os.path.join(dirpath, filename)))

    print(f"Found {len(scan_list)} files to process.")

-    for pair in tqdm.tqdm(scan_list, unit='files',file=sys.stderr):
+    for pair in tqdm.tqdm(scan_list, unit='files', file=sys.stderr):
        if desc := ffmpeg_description(pair[1]):
            table.writerow([pair[0], desc])

@@ -62,13 +62,13 @@ def finetune():
    """
    Fine-tune a model with training data 
    """
-    pass 
+    pass


@ucsinfer.command('evaluate')
@click.option('--offset', type=int, default=0)
@click.option('--limit', type=int, default=-1)
-@click.argument('dataset', type=click.File('r', encoding='utf8'), 
+@click.argument('dataset', type=click.File('r', encoding='utf8'),
                default='dataset.csv')
 def evaluate(dataset, offset, limit):
    """
@@ -82,7 +82,7 @@ def evaluate(dataset, offset, limit):
    for i, row in enumerate(tqdm.tqdm(reader)):
        if i < offset:
            continue
-        
+
        if limit > 0 and i >= limit + offset:
            break

@@ -107,33 +107,33 @@ def evaluate(dataset, offset, limit):

    miss_counts = []
    for cat in cats:
-        miss_counts.append((cat, len([x for x in results \
-                if x['catid'] == cat and x['result'] == 'MISS'])))
+        miss_counts.append((cat, len([x for x in results
+                                      if x['catid'] == cat and x['result'] == 'MISS'])))

    miss_counts = sorted(miss_counts, key=lambda x: x[1])

    print(f" === RESULTS === ")
-    
+
    table = [
-                ["Total records in sample:", f"{total}"],
-                ["Top Result:", f"{total_top}", 
-                 f"{float(total_top)/float(total):.2%}"],
-                ["Top 5 Result:", f"{total_top_5}", 
-                    f"{float(total_top_5)/float(total):.2%}"],
-                ["Top 10 Result:", f"{total_top_10}", 
-                    f"{float(total_top_10)/float(total):.2%}"],
-                SEPARATING_LINE,
-                ["UCS category count:", f"{len(ctx.catlist)}"],
-                ["Total categories in sample:", f"{total_cats}", 
-                    f"{float(total_cats)/float(len(ctx.catlist)):.2%}"],
-                [f"Most missed category ({miss_counts[-1][0]}):",
-                 f"{miss_counts[-1][1]}", 
-                 f"{float(miss_counts[-1][1])/float(total):.2%}"]
-            ]
+        ["Total records in sample:", f"{total}"],
+        ["Top Result:", f"{total_top}",
+         f"{float(total_top)/float(total):.2%}"],
+        ["Top 5 Result:", f"{total_top_5}",
+         f"{float(total_top_5)/float(total):.2%}"],
+        ["Top 10 Result:", f"{total_top_10}",
+         f"{float(total_top_10)/float(total):.2%}"],
+        SEPARATING_LINE,
+        ["UCS category count:", f"{len(ctx.catlist)}"],
+        ["Total categories in sample:", f"{total_cats}",
+         f"{float(total_cats)/float(len(ctx.catlist)):.2%}"],
+        [f"Most missed category ({miss_counts[-1][0]}):",
+         f"{miss_counts[-1][1]}",
+         f"{float(miss_counts[-1][1])/float(total):.2%}"]
+    ]
+
+    print(tabulate(table, headers=['', 'n', 'pct']))

-    print(tabulate(table, headers=['','n','pct']))

- 
 if __name__ == '__main__':
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'

@@ -11,6 +11,7 @@ import platformdirs

 from sentence_transformers import SentenceTransformer

+
 def classify_text_ranked(text, embeddings_list, model, limit=5):
    text_embedding = model.encode(text, convert_to_numpy=True)
    embeddings = np.array([info['Embedding'] for info in embeddings_list])
@@ -23,15 +24,16 @@ class Ucs(NamedTuple):
    catid: str
    category: str
    subcategory: str
-    explanations: str 
+    explanations: str
    synonymns: list[str]

    @classmethod
    def from_dict(cls, d: dict):
-        return Ucs(catid=d['CatID'], category=d['Category'], 
-                   subcategory=d['SubCategory'], 
+        return Ucs(catid=d['CatID'], category=d['Category'],
+                   subcategory=d['SubCategory'],
                   explanations=d['Explanations'], synonymns=d['Synonyms'])

+
 def load_ucs() -> list[Ucs]:
    FILE_ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
    cats = []
@@ -43,6 +45,7 @@ def load_ucs() -> list[Ucs]:

    return [Ucs.from_dict(cat) for cat in cats]

+
 class InferenceContext:
    """
    Maintains caches and resources for UCS category inference.
@@ -72,9 +75,9 @@ class InferenceContext:

            for cat_defn in self.catlist:
                embeddings += [{
-                        'CatID': cat_defn.catid,
-                        'Embedding': self._encode_category(cat_defn)
-                               }]
+                    'CatID': cat_defn.catid,
+                    'Embedding': self._encode_category(cat_defn)
+                }]

            os.makedirs(os.path.dirname(embedding_cache), exist_ok=True)
            with open(embedding_cache, 'wb') as g:
@@ -83,10 +86,10 @@ class InferenceContext:
        return embeddings

    def _encode_category(self, cat: Ucs) -> np.ndarray:
-        sentence_components = [cat.explanations, 
-                           cat.category, 
-                           cat.subcategory
-                           ]
+        sentence_components = [cat.explanations,
+                               cat.category,
+                               cat.subcategory
+                               ]
        sentence_components += cat.synonymns
        sentence = ", ".join(sentence_components)
        return self.model.encode(sentence, convert_to_numpy=True)
@@ -104,13 +107,12 @@ class InferenceContext:
    def lookup_category(self, catid) -> tuple[str, str, str]:
        """
        Get the category, subcategory and explanations phrase for a `catid`
-        
+
        :raises: StopIteration if CatId is not on the schedule 
        """
        i = (
-                (x.category, x.subcategory, x.explanations) \
-                        for x in self.catlist if x.catid == catid
-                        )
-
-        return next(i) 
+            (x.category, x.subcategory, x.explanations)
+            for x in self.catlist if x.catid == catid
+        )

+        return next(i)
@@ -1,20 +1,21 @@
 import subprocess
 import json
-from typing import NamedTuple, Optional 
+from typing import NamedTuple, Optional
 from re import match


-from .inference import Ucs 
+from .inference import Ucs
+

 def ffmpeg_description(path: str) -> Optional[str]:
-    result = subprocess.run(['ffprobe', '-show_format', '-of', 
+    result = subprocess.run(['ffprobe', '-show_format', '-of',
                             'json', path], capture_output=True)

    try:
        result.check_returncode()
    except:
        return None
-        
+
    stream = json.loads(result.stdout)
    fmt = stream.get("format", None)
    if fmt:
@@ -28,10 +29,10 @@ class UcsNameComponents(NamedTuple):
    Components of a UCS filename
    """
    cat_id: str
-    user_cat: str | None 
+    user_cat: str | None
    vendor_cat: str | None
    fx_name: str
-    creator: str | None 
+    creator: str | None
    source: str | None
    user_data: str | None

@@ -43,7 +44,7 @@ class UcsNameComponents(NamedTuple):
            return False

        if self.user_cat and not match(r"[^\-_]+", self.user_cat):
-            return False 
+            return False

        if self.vendor_cat and not match(r"[^\-_]+", self.vendor_cat):
            return False
@@ -52,7 +53,7 @@ class UcsNameComponents(NamedTuple):
            return False

        if self.creator and not match(r"[^_]+", self.creator):
-            return False 
+            return False

        if self.source and not match(r"[^_]+", self.source):
            return False
@@ -73,7 +74,7 @@ def build_ucs(components: UcsNameComponents, extension: str) -> str:
 def parse_ucs(rootname: str, catid_list: list[str]) -> Optional[UcsNameComponents]:
    """
    Parse the UCS components from a file name root.
-    
+
    :param rootname: filename root, the basename of the file without extension 
    :param catid_list: a list of all UCS CatIDs
    :returns: the components, or `None` if the filename is not in UCS format
@@ -82,8 +83,8 @@ def parse_ucs(rootname: str, catid_list: list[str]) -> Optional[UcsNameComponent
    regexp1 = r"^(?P<CatID>[A-z]+)(-(?P<UserCat>[^_]+))?_((?P<VendorCat>[^-]+)-)?(?P<FXName>[^_]+)"

    regexp2 = r"(_(?P<CreatorID>[^_]+)(_(?P<SourceID>[^_]+)(_(?P<UserData>[^.]+))?)?)?"
-    
-    regexp = regexp1 + regexp2 
+
+    regexp = regexp1 + regexp2

    matches = match(regexp, rootname)