# Dedupe Example

from oagdedupe.settings import (
    Settings,
    SettingsModel,
    SettingsDB,
    SettingsLabelStudio,
    SettingsService
)
from oagdedupe.api import Dedupe

import glob

import pandas as pd
from sqlalchemy import create_engine

# Print floating-point values with 12 digits after the decimal point.
pd.options.display.precision = 12

# SQLAlchemy engine for a PostgreSQL database reached through psycopg2.
# NOTE(review): "username:password" look like placeholder credentials, and
# the engine is not referenced in the visible part of this script — confirm
# whether it is still needed.
engine = create_engine(
    "postgresql+psycopg2://username:password@0.0.0.0:8000/db"
)

# %%
# Settings object passed to Dedupe below; `attributes` lists the dataframe
# columns that this script casts to str before handing the data over.
# NOTE(review): presumably these are the columns compared during
# deduplication — confirm against the oagdedupe Settings documentation.
settings = Settings(
    attributes=["givenname", "surname", "suburb", "postcode"],
)

# %%
# Read at most two files from the north_carolina_voters directory into a
# single dataframe with a fresh 0..n-1 index.
data_glob = (
    "/mnt/Research.CF/References & Training/Satchel/dedupe_rl/baseline_datasets/north_carolina_voters/*"
)
# glob returns matches in arbitrary order; sort first so the same two files
# are selected on every run.
files = sorted(glob.glob(data_glob))[:2]
if not files:
    # Fail with a clear message instead of pd.concat's
    # "No objects to concatenate" error on an empty list.
    raise FileNotFoundError(f"no input files match {data_glob!r}")
df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
# Cast every configured attribute column to str in one pass.
# NOTE(review): a KeyError here means the files lack one of
# settings.attributes — verify the column names against the dataset.
df = df.astype({attr: str for attr in settings.attributes})

# %%
# Build the Dedupe API object from the settings and hand it the dataframe.
# NOTE(review): reset=True presumably clears any previously stored state
# before loading df — confirm against the oagdedupe documentation.
d = Dedupe(settings=settings)
d.initialize(df=df, reset=True)

# %%
# Fit the blocks, then run prediction and keep the result in `res`.
# NOTE(review): the structure of `res` is not visible from this script —
# check the return type of Dedupe.predict before using it.
d.fit_blocks()
res = d.predict()