"""Similarity detection for registry tools."""

from __future__ import annotations

from difflib import SequenceMatcher
from typing import Dict, List, Tuple


def _tokenize(text: str) -> List[str]:
    """Lower-case *text* and return its non-empty alphanumeric tokens."""
    return [t for t in re_split_nonword(text.lower()) if t]


def re_split_nonword(text: str) -> List[str]:
    """Split *text* into runs of alphanumeric characters (a regex-free tokenizer)."""
    token = ""
    tokens = []
    for ch in text:
        if ch.isalnum():
            token += ch
        else:
            if token:
                tokens.append(token)
            token = ""
    if token:
        tokens.append(token)
    return tokens


def jaccard(a: List[str], b: List[str]) -> float:
    """Jaccard similarity of two token lists; 0.0 when both are empty."""
    set_a = set(a)
    set_b = set(b)
    if not set_a and not set_b:
        return 0.0
    return len(set_a & set_b) / max(len(set_a | set_b), 1)


def name_similarity(name_a: str, name_b: str) -> float:
    """Case-insensitive sequence similarity between two tool names."""
    return SequenceMatcher(None, name_a.lower(), name_b.lower()).ratio()


def description_similarity(desc_a: str, desc_b: str) -> float:
    """Token-level Jaccard similarity between two descriptions."""
    return jaccard(_tokenize(desc_a), _tokenize(desc_b))


def tags_similarity(tags_a: List[str], tags_b: List[str]) -> float:
    """Case-insensitive Jaccard similarity between two tag lists."""
    return jaccard([t.lower() for t in tags_a], [t.lower() for t in tags_b])


def score_similarity(
    candidate: Dict,
    name: str,
    description: str,
    tags: List[str],
    category: str | None,
) -> float:
    """Weighted similarity score in [0, 1] between *candidate* and a proposed tool."""
    name_score = name_similarity(name, candidate.get("name", ""))
    desc_score = description_similarity(description, candidate.get("description", ""))
    tags_score = tags_similarity(tags, candidate.get("tags", []))
    # Names dominate, then descriptions, then tags; a shared category adds a small bonus.
    category_bonus = 0.1 if category and candidate.get("category") == category else 0.0
    score = (0.5 * name_score) + (0.3 * desc_score) + (0.2 * tags_score) + category_bonus
    return min(score, 1.0)


def find_similar_tools(
    tools: List[Dict],
    name: str,
    description: str,
    tags: List[str],
    category: str | None,
    threshold: float = 0.6,
) -> List[Tuple[Dict, float]]:
    """Return (tool, score) pairs with score >= *threshold*, sorted best first."""
    results = []
    for tool in tools:
        score = score_similarity(tool, name, description, tags, category)
        if score >= threshold:
            results.append((tool, round(score, 3)))
    results.sort(key=lambda item: item[1], reverse=True)
    return results
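

# Illustrative usage, not part of the original module: a minimal sketch of how
# find_similar_tools might be called against an in-memory registry. The tool
# records, names, tags, and the "io" category below are hypothetical.
if __name__ == "__main__":
    registry = [
        {
            "name": "csv-export",
            "description": "Export registry entries to CSV files",
            "tags": ["export", "csv"],
            "category": "io",
        },
        {
            "name": "json-import",
            "description": "Import tool definitions from JSON",
            "tags": ["import", "json"],
            "category": "io",
        },
    ]
    matches = find_similar_tools(
        registry,
        name="csv-exporter",
        description="Export entries to CSV",
        tags=["csv", "export"],
        category="io",
        threshold=0.5,
    )
    for tool, score in matches:
        print(f"{tool['name']}: {score}")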