In [1]:
categories = [
    {
        "name": "Automobile",
        "description": "Topics related to vehicles such as cars, trucks, and their brands.",
        "keywords": ["Mazda", "Toyota", "SUV", "sedan", "pickup"]
    },
    {
        "name": "Firearms",
        "description": "Topics related to guns, rifles, pistols, ammunition, and other weapons.",
        "keywords": ["Winchester", "Glock", "rifle", "bullet", "shotgun"]
    },
    {
        "name": "Computers",
        "description": "Topics involving computer hardware and software, such as hard drives, CPUs, and laptops.",
        "keywords": ["Winchester", "CPU", "hard drive", "RAM", "SSD", "motherboard"]
    },
]
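Note that the keyword "Winchester" appears in both the Firearms and the Computers categories (it is a rifle brand as well as an old nickname for hard disk drives), so the classifier has to disambiguate from context rather than from keywords alone. A quick, optional sanity check for such overlaps:
In [ ]:
# Optional: list keywords shared by more than one category.
from itertools import combinations
for a, b in combinations(categories, 2):
    shared = set(a["keywords"]) & set(b["keywords"])
    if shared:
        print(f"{a['name']} / {b['name']} share keywords: {shared}")
# With the categories above, this reports that Firearms and Computers share {'Winchester'}.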
In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np
from numpy.linalg import norm

model = SentenceTransformer("all-MiniLM-L6-v2")

def build_category_embedding(cat_info):
    # Concatenate the name, description, and keywords into a single string and embed it.
    components = [cat_info["name"], cat_info["description"]] + cat_info.get("keywords", [])
    composite_text = ". ".join(components)
    return model.encode(composite_text, convert_to_numpy=True)

# Embed all categories
for info in categories:
    info["embedding"] = build_category_embedding(info)
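build_category_embedding encodes one concatenated string per category. A common variant is to encode each component separately and mean-pool the vectors; a minimal sketch (the function name is illustrative and it is not used in the cells below):
In [ ]:
def build_category_embedding_mean(cat_info):
    # Encode name, description, and each keyword separately, then average the vectors.
    components = [cat_info["name"], cat_info["description"]] + cat_info.get("keywords", [])
    embeddings = model.encode(components, convert_to_numpy=True)  # (n_components, 384) for all-MiniLM-L6-v2
    return embeddings.mean(axis=0)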
In [3]:
# Manual cosine similarity (model.similarity below computes the same thing):
# def cosine_similarity(a, b):
#     return np.dot(a, b) / (norm(a) * norm(b))

def classify_text(text, categories):
    # Embed the input text and compare it against every category embedding.
    text_embedding = model.encode(text, convert_to_numpy=True)
    print(f"Text: {text}")
    sim = model.similarity(text_embedding, np.stack([info["embedding"] for info in categories]))
    print(sim)
    # Pick the category with the highest cosine similarity.
    maxind = np.argmax(sim)
    print(f" -> Category: {categories[maxind]['name']}")
In [4]:
text1 = "I took my Winchester to the shooting range yesterday."
text2 = "I bought a new Mazda with an automatic transmission."
text3 = "My old Winchester hard drive finally failed."
text4 = "Keys clicking, typing"
for text in [text1, text2, text3, text4]:
    classify_text(text, categories)
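The last sentence, "Keys clicking, typing", does not fit any of the three categories well; a variant of classify_text that falls back to "Unknown" when the best similarity is below a cutoff can make that explicit. This is only a sketch: the helper name and the 0.35 threshold are illustrative and would need tuning for a real dataset.
In [ ]:
def classify_text_with_threshold(text, categories, threshold=0.35):
    # Same idea as classify_text, but reject low-confidence matches.
    text_embedding = model.encode(text, convert_to_numpy=True)
    sim = model.similarity(text_embedding, np.stack([info["embedding"] for info in categories]))
    best = int(np.argmax(sim))
    score = float(sim[0, best])
    return ("Unknown", score) if score < threshold else (categories[best]["name"], score)

for text in [text1, text4]:
    print(text, "->", classify_text_with_threshold(text, categories))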