Files
ucsinfer/notebooks/00_Infer.ipynb
2025-08-03 15:03:54 -07:00

5.3 KiB

In [1]:
categories = [
    {
        "name": "Automobile",
        "description": "Topics related to vehicles such as cars, trucks, and their brands.",
        "keywords": ["Mazda", "Toyota", "SUV", "sedan", "pickup"]
    },
    {
        "name": "Firearms",
        "description": "Topics related to guns, rifles, pistols, ammunition, and other weapons.",
        "keywords": ["Winchester", "Glock", "rifle", "bullet", "shotgun"]
    },
    {
        "name": "Computers",
        "description": "Topics involving computer hardware and software, such as hard drives, CPUs, and laptops.",
        "keywords": ["Winchester", "CPU", "hard drive", "RAM", "SSD", "motherboard"]
    },
]
In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np
from numpy.linalg import norm

model = SentenceTransformer("all-MiniLM-L6-v2")

def build_category_embedding(cat_info):
    components = [cat_info["name"], cat_info["description"]] + cat_info.get('keywords', [])
    composite_text = ". ".join(components)
    return model.encode(composite_text, convert_to_numpy=True)

# Embed all categories

for info in categories:
    info['embedding'] = build_category_embedding(info)
/Users/j/Library/Caches/pypoetry/virtualenvs/ucsinfer2-yBtBMMP2-py3.13/lib/python3.13/site-packages/torch/nn/modules/module.py:1762: FutureWarning: `encoder_attention_mask` is deprecated and will be removed in version 4.55.0 for `BertSdpaSelfAttention.forward`.
  return forward_call(*args, **kwargs)
In [3]:
# def cosine_similarity(a, b):
#     return np.dot(a, b) / (norm(a) * norm(b))

def classify_text(text, categories):
    text_embedding = model.encode(text, convert_to_numpy=True)

    print(f"Text: {text}")
    sim = model.similarity(text_embedding, [info['embedding'] for info in categories])
    print(sim)
    maxind = np.argmax(sim)
    print(f" -> Category: {categories[maxind]['name']}")
In [4]:
text1 = "I took my Winchester to the shooting range yesterday."
text2 = "I bought a new Mazda with an automatic transmission."
text3 = "My old Winchester hard drive finally failed."
text4 = "Keys clicking, typing"


for text in [text1, text2, text3, text4]:
    classify_text(text, categories)
Text: I took my Winchester to the shooting range yesterday.
tensor([[-0.0456,  0.3874,  0.0935]])
 -> Category: Firearms
Text: I bought a new Mazda with an automatic transmission.
tensor([[0.3483, 0.0454, 0.0285]])
 -> Category: Automobile
/Users/j/Library/Caches/pypoetry/virtualenvs/ucsinfer2-yBtBMMP2-py3.13/lib/python3.13/site-packages/sentence_transformers/util.py:55: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_new.cpp:257.)
  a = torch.tensor(a)
Text: My old Winchester hard drive finally failed.
tensor([[-0.0305,  0.2024,  0.3047]])
 -> Category: Computers
Text: Keys clicking, typing
tensor([[0.0957, 0.1107, 0.1531]])
 -> Category: Computers
In [ ]: