Source code for oagdedupe.cluster.cluster

from dataclasses import dataclass
from typing import List, Union

import networkx as nx
import pandas as pd

from oagdedupe import utils as du
from oagdedupe.base import BaseCluster
from oagdedupe.db.base import BaseRepository
from oagdedupe.settings import Settings


@dataclass
class ConnectedComponents(BaseCluster):
    """
    Cluster scored candidate pairs by taking the connected components of
    the graph they form.

    Attributes
    ----------
    repo: BaseRepository
        repository used to fetch pair scores and to merge the resulting
        clusters with the raw data
    settings: Settings
        project settings; not read by the method bodies in this class.
        NOTE(review): presumably consumed by the ``du.recordlinkage``
        decorator -- confirm.
    """

    repo: BaseRepository
    settings: Settings

    @du.recordlinkage
    def get_df_cluster(
        self, threshold: float = 0.8, rl: str = ""
    ) -> Union[pd.DataFrame, List[pd.DataFrame]]:
        """
        Convert connected components to dataframe for user friendly output

        Parameters
        ----------
        threshold: float
            forwarded to ``repo.get_scores``; pairs below this score are
            not considered for clustering
        rl: str
            suffix appended to ``get_connected_components`` to select the
            component builder, and forwarded unchanged to
            ``repo.merge_clusters_with_raw_data``. The default ``""``
            selects ``get_connected_components``.
            NOTE(review): other values are presumably supplied by the
            ``du.recordlinkage`` decorator -- confirm.

        Returns
        -------
        pd.DataFrame or List[pd.DataFrame]
            clusters merged with raw data, exactly as returned by
            ``repo.merge_clusters_with_raw_data``

        Raises
        ------
        AttributeError
            if no method named ``get_connected_components{rl}`` exists on
            this object
        """
        scores = self.repo.get_scores(threshold=threshold)
        # Dispatch by name so that one entry point can serve every linkage
        # mode that provides its own ``get_connected_components<suffix>``.
        build_components = getattr(self, f"get_connected_components{rl}")
        df_clusters = build_components(scores)
        return self.repo.merge_clusters_with_raw_data(
            df_clusters=df_clusters, rl=rl
        )

    def get_connected_components(self, scores: pd.DataFrame) -> pd.DataFrame:
        """
        Build graph with "matched" candidate pairs, weighted by p(match),
        and label every record with the connected component it falls in.

        Edge weights are stored on the graph but are not used when the
        components are computed; need to add feature to consider weights
        when generating connected components.

        Parameters
        ----------
        scores: pd.DataFrame
            dataframe with pair indices (``_index_l``, ``_index_r``) and
            match scores (``score``)

        Returns
        -------
        pd.DataFrame
            dataframe mapping cluster index (``cluster``) to entity index
            (``_index``), plus a ``_type`` column that is always ``None``.
            Rows within a cluster are in ascending ``_index`` order. The
            three columns are present even when ``scores`` has no rows.

        Raises
        ------
        ValueError
            if a pair index does not render as an integer literal
        """
        columns = ["cluster", "_index", "_type"]

        g = nx.Graph()
        # Node labels are the string form of the record index; they are
        # converted back to int when the output rows are built.
        g.add_weighted_edges_from(
            (
                f"{pair['_index_l']}",
                f"{pair['_index_r']}",
                pair["score"],
            )
            for pair in scores.to_dict(orient="records")
        )

        # Each component is a set of string labels whose iteration order
        # depends on per-process hash randomisation, so members are sorted
        # numerically to keep the output reproducible between runs.
        clusters = [
            {"cluster": cluster_idx, "_index": rec_id, "_type": None}
            for cluster_idx, component in enumerate(nx.connected_components(g))
            for rec_id in sorted(int(node) for node in component)
        ]

        # Explicit columns keep the schema stable when there are no pairs;
        # an empty list of records would otherwise give a column-less frame.
        return pd.DataFrame(clusters, columns=columns)