orchestrated-discussions/.venv/lib/python3.12/site-packages/cmdforge/registry/similarity.py

78 lines
2.1 KiB
Python

"""Similarity detection for registry tools."""
from __future__ import annotations
from difflib import SequenceMatcher
from typing import Dict, List, Tuple
def _tokenize(text: str) -> List[str]:
    """Lower-case *text* and split it into its alphanumeric tokens."""
    lowered = text.lower()
    # filter(None, ...) discards any empty strings from the split result.
    return list(filter(None, re_split_nonword(lowered)))
def re_split_nonword(text: str) -> List[str]:
    """Split *text* into maximal runs of alphanumeric characters.

    Every character for which ``str.isalnum()`` is false acts as a
    separator and is dropped. No empty strings are produced.
    """
    pieces = []
    run_start = None  # index where the current alphanumeric run began
    for idx, ch in enumerate(text):
        if ch.isalnum():
            if run_start is None:
                run_start = idx
        elif run_start is not None:
            # A separator ends the run that was in progress.
            pieces.append(text[run_start:idx])
            run_start = None
    # Flush a run that extends to the end of the input.
    if run_start is not None:
        pieces.append(text[run_start:])
    return pieces
def jaccard(a: List[str], b: List[str]) -> float:
    """Return the Jaccard index of the distinct items in *a* and *b*.

    The result is ``|A & B| / |A | B|`` over the de-duplicated inputs.
    When both inputs are empty the result is ``0.0``.
    """
    left, right = set(a), set(b)
    union = left | right
    if not union:
        # Both inputs were empty; there is nothing to compare.
        return 0.0
    shared = left & right
    return len(shared) / len(union)
def name_similarity(name_a: str, name_b: str) -> float:
    """Return the case-insensitive ``SequenceMatcher`` ratio of two names."""
    left = name_a.lower()
    right = name_b.lower()
    matcher = SequenceMatcher(isjunk=None, a=left, b=right)
    return matcher.ratio()
def description_similarity(desc_a: str, desc_b: str) -> float:
    """Return the Jaccard index of the token sets of two descriptions."""
    tokens_a = _tokenize(desc_a)
    tokens_b = _tokenize(desc_b)
    return jaccard(tokens_a, tokens_b)
def tags_similarity(tags_a: List[str], tags_b: List[str]) -> float:
    """Return the Jaccard index of two tag lists, compared case-insensitively."""
    normalized_a = [tag.lower() for tag in tags_a]
    normalized_b = [tag.lower() for tag in tags_b]
    return jaccard(normalized_a, normalized_b)
def score_similarity(
    candidate: Dict,
    name: str,
    description: str,
    tags: List[str],
    category: str | None,
) -> float:
    """Score how closely *candidate* resembles the given tool metadata.

    The score is a weighted sum of three component similarities:
    ``0.5 * name + 0.3 * description + 0.2 * tags``. A flat ``0.1`` bonus
    is added when *category* is truthy and equals the candidate's
    ``"category"`` value. The result is capped at ``1.0``.

    Args:
        candidate: Registry entry; the ``"name"``, ``"description"``,
            ``"tags"`` and ``"category"`` keys are read. A key that is
            missing or holds a falsy value such as ``None`` is treated
            as empty.
        name: Name to compare against the candidate's name.
        description: Description to compare against the candidate's.
        tags: Tags to compare against the candidate's tags.
        category: Category to match for the bonus, or ``None`` to skip it.

    Returns:
        The combined similarity score, at most ``1.0``.
    """
    # ``dict.get(key, default)`` only applies the default when the key is
    # absent. An entry that stores an explicit ``None`` would otherwise
    # reach ``.lower()`` or iteration and raise, so coerce falsy values.
    candidate_name = candidate.get("name") or ""
    candidate_description = candidate.get("description") or ""
    candidate_tags = candidate.get("tags") or []

    name_score = name_similarity(name, candidate_name)
    desc_score = description_similarity(description, candidate_description)
    tags_score = tags_similarity(tags, candidate_tags)
    category_bonus = 0.1 if category and candidate.get("category") == category else 0.0

    score = (0.5 * name_score) + (0.3 * desc_score) + (0.2 * tags_score) + category_bonus
    # The bonus can push the weighted sum above 1.0, so clamp it.
    return min(score, 1.0)
def find_similar_tools(
    tools: List[Dict],
    name: str,
    description: str,
    tags: List[str],
    category: str | None,
    threshold: float = 0.6,
) -> List[Tuple[Dict, float]]:
    """Return the tools whose similarity score reaches *threshold*.

    Each entry in *tools* is scored with ``score_similarity``. Entries
    whose unrounded score is at least *threshold* are returned as
    ``(tool, score)`` pairs, with the score rounded to three decimals,
    ordered from highest to lowest rounded score.
    """
    scored = (
        (entry, score_similarity(entry, name, description, tags, category))
        for entry in tools
    )
    # The threshold is applied to the raw score; rounding happens afterwards.
    matches = [
        (entry, round(value, 3)) for entry, value in scored if value >= threshold
    ]
    return sorted(matches, key=lambda pair: pair[1], reverse=True)