# Eelco van der Wel authored e621cede
import numpy as np
from hdbscan import HDBSCAN
from umap import UMAP
def compute_umap(
    embeddings: np.ndarray,
    *,
    n_neighbors: int = 15,
    n_components: int = 5,
    min_dist: float = 0.0,
    metric: str = 'cosine',
    random_state: int = 42,
) -> np.ndarray:
    """Reduce the dimensionality of ``embeddings`` with UMAP.

    All keyword arguments are forwarded unchanged to the ``UMAP``
    constructor; their defaults reproduce the values this function
    previously hard-coded, so existing calls behave the same.

    Args:
        embeddings: Array of input vectors to reduce (presumably one row
            per item -- confirm against the caller).
        n_neighbors: Passed as ``UMAP(n_neighbors=...)``.
        n_components: Passed as ``UMAP(n_components=...)``; per the UMAP
            documentation this is the dimensionality of the output space.
        min_dist: Passed as ``UMAP(min_dist=...)``.
        metric: Passed as ``UMAP(metric=...)``.
        random_state: Passed as ``UMAP(random_state=...)`` to seed the
            reducer.

    Returns:
        The result of ``UMAP.fit_transform(embeddings)``.
    """
    reducer = UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        min_dist=min_dist,
        metric=metric,
        # Kept fixed, exactly as in the original implementation.
        low_memory=False,
        random_state=random_state,
    )
    return reducer.fit_transform(embeddings)
def cluster_embeddings(embeddings: np.ndarray, min_topic_size: int) -> np.ndarray:
    """Cluster ``embeddings`` with HDBSCAN and return the predicted labels.

    Args:
        embeddings: Array of vectors to cluster.
        min_topic_size: Forwarded to HDBSCAN as ``min_cluster_size``.

    Returns:
        The label array produced by ``HDBSCAN.fit_predict(embeddings)``.
        (Per the hdbscan documentation, a label of -1 marks noise points.)
    """
    clusterer = HDBSCAN(
        prediction_data=False,
        cluster_selection_method='eom',
        metric='euclidean',
        min_cluster_size=min_topic_size,
    )
    labels = clusterer.fit_predict(embeddings)
    return labels