# clustering.py
import numpy as np
from hdbscan import HDBSCAN
from umap import UMAP
def compute_umap(embeddings: np.ndarray) -> np.ndarray:
    """Project *embeddings* into a 5-component space with UMAP.

    The reducer is configured with cosine distance, 15 neighbours and
    ``min_dist=0.0``. ``random_state`` is pinned to 42 so repeated calls
    use the same seed.

    Args:
        embeddings: Input array passed unchanged to ``UMAP.fit_transform``.
            Presumably shaped ``(n_samples, n_features)`` -- confirm against
            the caller.

    Returns:
        The array produced by ``UMAP.fit_transform`` for the settings above.
    """
    reducer = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        low_memory=False,
        random_state=42,
    )
    return reducer.fit_transform(embeddings)
def cluster_embeddings(embeddings: np.ndarray, min_topic_size: int) -> np.ndarray:
    """Cluster *embeddings* with HDBSCAN and return one label per input row.

    Clustering uses Euclidean distance and the ``'eom'`` cluster selection
    method. Prediction data is not generated (``prediction_data=False``).

    Args:
        embeddings: Input array passed unchanged to ``HDBSCAN.fit_predict``.
            Presumably the reduced vectors from ``compute_umap`` -- confirm
            against the caller.
        min_topic_size: Forwarded as HDBSCAN's ``min_cluster_size``.

    Returns:
        The label array produced by ``HDBSCAN.fit_predict``. Per the hdbscan
        documentation, points treated as noise are labelled ``-1``.
    """
    clusterer = HDBSCAN(
        min_cluster_size=min_topic_size,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=False,
    )
    return clusterer.fit_predict(embeddings)