"""Similarity detection for registry tools.""" from __future__ import annotations from difflib import SequenceMatcher from typing import Dict, List, Tuple def _tokenize(text: str) -> List[str]: return [t for t in re_split_nonword(text.lower()) if t] def re_split_nonword(text: str) -> List[str]: token = "" tokens = [] for ch in text: if ch.isalnum(): token += ch else: if token: tokens.append(token) token = "" if token: tokens.append(token) return tokens def jaccard(a: List[str], b: List[str]) -> float: set_a = set(a) set_b = set(b) if not set_a and not set_b: return 0.0 return len(set_a & set_b) / max(len(set_a | set_b), 1) def name_similarity(name_a: str, name_b: str) -> float: return SequenceMatcher(None, name_a.lower(), name_b.lower()).ratio() def description_similarity(desc_a: str, desc_b: str) -> float: return jaccard(_tokenize(desc_a), _tokenize(desc_b)) def tags_similarity(tags_a: List[str], tags_b: List[str]) -> float: return jaccard([t.lower() for t in tags_a], [t.lower() for t in tags_b]) def score_similarity( candidate: Dict, name: str, description: str, tags: List[str], category: str | None, ) -> float: name_score = name_similarity(name, candidate.get("name", "")) desc_score = description_similarity(description, candidate.get("description", "")) tags_score = tags_similarity(tags, candidate.get("tags", [])) category_bonus = 0.1 if category and candidate.get("category") == category else 0.0 score = (0.5 * name_score) + (0.3 * desc_score) + (0.2 * tags_score) + category_bonus return min(score, 1.0) def find_similar_tools( tools: List[Dict], name: str, description: str, tags: List[str], category: str | None, threshold: float = 0.6, ) -> List[Tuple[Dict, float]]: results = [] for tool in tools: score = score_similarity(tool, name, description, tags, category) if score >= threshold: results.append((tool, round(score, 3))) results.sort(key=lambda item: item[1], reverse=True) return results