Commit f51251dd authored by Alp Deniz Ogut's avatar Alp Deniz Ogut
Browse files

Update for new suggestion type & tune tests

parent 9657d520
Showing with 58 additions and 57 deletions
+58 -57
......@@ -4,7 +4,7 @@
# TODO: Consider how to achieve continuous weight optimization (NN training)
# TODO: Add weight types, e.g. exact and distance
DEFAULT_SIMILARITY_SCORE_WEIGHTS = {'email': 0.75, 'hasPhoneNumber': 0.75, 'address': 0.4, 'displayName': 0.25,
DEFAULT_SIMILARITY_SCORE_WEIGHTS = {'email': 0.7, 'hasPhoneNumber': 0.7, 'address': 0.4, 'displayName': 0.25,
'birthDate': 0.25, 'firstName': 0.15, 'lastName': 0.25, 'gender': 0.05, 'empty_field_score': 0.05, 'default_weight': 0.05}
# Score threshold for each action
DEFAULT_SIMILARITY_CORE_THRESHOLDS = { 'merge': 1.0, 'suggest': 0.8, 'min': 0.8 }
\ No newline at end of file
......@@ -7,7 +7,7 @@ ID_KEY = 'id'
ITEM_KEY = 'item'
SCORE_KEY = 'score'
EXACT_SIMILARITY_SCORE = 10.0
SIGNIFICANT_WEIGHT_THRESHOLD = 0.25
SIGNIFICANT_WEIGHT_THRESHOLD = 0.24
class Merger:
......
......@@ -15,7 +15,7 @@ DEFAULT_WEIGHTS = {
}
LSH_LIST_JACCARD_THRESHOLD = 0.1 # required minimum list similarity index to pass a LSH filter
LSH_STRING_DISTANCE_THRESHOLD = 1.0 # required minimum string similarity to pass a LSH filter
RELU_LEAK = 0.3
RELU_LEAK = 0.1
MONTHS_IN_SECONDS = 30 * 24 * 60 * 60
......@@ -52,20 +52,25 @@ def list_distance(list_1, list_2):
if item_1 == item_2: # can use string_distance here
list_score += 1
# Apply jaccard similarity on two lists. score by "intersection over union"
jaccard_similarity = list_score / (len(list_1) + len(list_2) - list_score)
return jaccard_similarity
### activation functions ###
def relu(x):
if x > DEFAULT_WEIGHTS[STRING_ACTIVATION_THRESHOLD_KEY]:
def relu(x, threshold=None):
if threshold is None:
threshold = DEFAULT_WEIGHTS[STRING_ACTIVATION_THRESHOLD_KEY]
if x >= threshold:
return x
else:
return 0
def leaky_relu(x):
if x > DEFAULT_WEIGHTS[STRING_ACTIVATION_THRESHOLD_KEY]:
def leaky_relu(x, threshold=None):
if threshold is None:
threshold = DEFAULT_WEIGHTS[STRING_ACTIVATION_THRESHOLD_KEY]
if x >= threshold:
return x
else:
return RELU_LEAK * x
......@@ -101,7 +106,7 @@ def get_candidate_pairs(pairs, significant_keys):
# List items
if isinstance(val_1, list):
jaccard_index = list_distance(val_1, val_2)
if jaccard_index > LSH_LIST_JACCARD_THRESHOLD:
if jaccard_index >= LSH_LIST_JACCARD_THRESHOLD:
candidate_pairs.append((pair[0], pair[1]))
break
......@@ -164,13 +169,12 @@ def calculate_pair_score(pair, score_weights):
if isinstance(val_1, list):
# jaccard index of two lists
score += list_distance(val_1, val_2) * weight
else:
# levenshtein-based string distance
distance = string_distance(val_1, val_2)
# This threshold is similar to activation function: Do not get score by random similarities
score += leaky_relu(distance) * weight
score += leaky_relu(distance, score_weights[STRING_ACTIVATION_THRESHOLD_KEY]) * weight
return score
from person_deduplication.data.schema import SuggestedMerge
from pymemri.data.itembase import *
from pymemri.indexers.indexer import *
from person_deduplication.merger.merger import Merger
from person_deduplication.merger.merger import ID_KEY, Merger
from person_deduplication.adaptors.memri import MemriAdaptor
from person_deduplication.config import DEFAULT_SIMILARITY_CORE_THRESHOLDS, DEFAULT_SIMILARITY_SCORE_WEIGHTS
SCORE_KEY = 'score'
ACTION_KEY = 'action'
DEFAULT_ACTION = 'idle'
TYPE_ATTRIBUTE = 'type'
SUGGESTION_TYPE_KEY = 'SuggestedMerge'
MERGED_FROM_KEY = 'mergedFrom'
MERGE_FROM_KEY = 'mergeFrom'
class PersonDeduplicationIndexer(IndexerBase):
......@@ -45,7 +49,7 @@ class PersonDeduplicationIndexer(IndexerBase):
lower_confidence_list = merger.extract_suggestions(score_thresholds['suggest'], score_thresholds['merge'])
print(f"Applying {len(high_confidence_list)} and suggesting {len(lower_confidence_list)} merges")
# get resulting modified items
dict_items = self._apply_merges(high_confidence_list)
dict_items += self._apply_suggestions(lower_confidence_list)
......@@ -91,14 +95,15 @@ class PersonDeduplicationIndexer(IndexerBase):
if True in [i['item']['deleted'] for i in to_merge]:
continue
extra_properties = {
suggested_merge_item = {
ID_KEY: None,
TYPE_ATTRIBUTE: SUGGESTION_TYPE_KEY,
SCORE_KEY: min([i[SCORE_KEY] for i in to_merge]), # minimum confidence score in list
TYPE_ATTRIBUTE : SUGGESTION_TYPE_KEY + to_merge[0]['item'][TYPE_ATTRIBUTE],
MERGED_FROM_KEY : [f"{i['item'][TYPE_ATTRIBUTE]}:{i['item']['id']}" for i in to_merge]
ACTION_KEY: DEFAULT_ACTION,
MERGE_FROM_KEY: [f"{i['item'][TYPE_ATTRIBUTE]}:{i['item']['id']}" for i in to_merge]
}
new_item = Merger.create_merged_item([i['item'] for i in to_merge], extra_properties)
# mark suggested merge item for update
items_to_process.append(new_item)
items_to_process.append(suggested_merge_item)
return items_to_process
......
......@@ -13,9 +13,9 @@ WEIGHTS = {
}
THRESHOLDS = {
'merge': 0.99,
'suggest': 0.8,
'min': 0.8
'merge': 1.0,
'suggest': 0.7,
'min': 0.7
}
ITEMS = [
......
......@@ -2,12 +2,14 @@ from unittest import TestCase
from pymemri.pod.client import PodClient
from person_deduplication.person_deduplication import PersonDeduplicationIndexer
from pymemri.data.schema import Person, PhoneNumber
from pymemri.data.schema import Account, PhoneNumber
from pymemri.indexers.indexer import IndexerData, IndexerRun
from person_deduplication.data.schema import SuggestedMergePerson, Weight, Threshold
from person_deduplication.data.schema import Person, SuggestedMerge, Weight, Threshold
from person_deduplication.config import DEFAULT_SIMILARITY_CORE_THRESHOLDS, DEFAULT_SIMILARITY_SCORE_WEIGHTS
from random import randint
def generate_test_data():
......@@ -61,11 +63,22 @@ def generate_test_data():
def generate_test_data_2():
phone_numbers = []
with open('./data/phone_numbers.csv', 'r') as f:
for line in f:
id, number = line.strip().split(',')
phone_numbers.append(PhoneNumber(id=id, phoneNumber=number))
people = []
with open('./data/people.csv', 'r') as f:
for line in f:
id, firstName, lastName, email, gender, displayName, birthDate = [v if v != 'None' else None for v in line.strip().split(',')]
p = Person(id=id, firstName=firstName, lastName=lastName, email=email, gender=gender, displayName=displayName, birthDate=birthDate)
# if randint(0,5) == 2:
# p.add_edge('hasPhoneNumber', phone_numbers[randint(0, len(phone_numbers)-1)])
# for i in range(2):
# p.add_edge('account', Account(id=f"{id}00000{i}", externalId=f"30000{id}{i}", displayName=displayName, service="whatsapp"))
people.append(p)
return people
......@@ -73,17 +86,20 @@ def generate_test_data_2():
class PersonSimilarityTestCase(TestCase):
client = PodClient(database_key="abc", owner_key="123")
client = PodClient(database_key="3169095052551985645422890382713954115829390594101505849463195193", owner_key="73191636085796312701326009639994440194633324")
def testSimilarityScores(self):
# make sure schemas exist
self.client.add_to_schema(Person(firstName="", lastName="", email="", gender="", birthDate="", birthPlace=""))
self.client.add_to_schema(SuggestedMergePerson(score=""))
self.client.add_to_schema(Weight(name="", value="0.5", activation='linear', targetDataType=""))
self.client.add_to_schema(Person(firstName="", lastName="", email="", gender="", displayName="", birthDate=""))
self.client.add_to_schema(SuggestedMerge(score="0.9", task="idle", mergeFrom=[]))
self.client.add_to_schema(Weight(name="", value="0.6", activation='linear', targetDataType=""))
self.client.add_to_schema(Threshold(name="", value="0.9", targetDataType=""))
self.input = generate_test_data_2()
if not self.input[-1].exists(self.client):
for p in self.input:
self.client.create(p)
data = IndexerData(items=self.input, weights=DEFAULT_SIMILARITY_SCORE_WEIGHTS,
thresholds=DEFAULT_SIMILARITY_CORE_THRESHOLDS)
......@@ -93,35 +109,11 @@ class PersonSimilarityTestCase(TestCase):
items = indexer.index(data, indexer_run, self.client)
# indexer.populate(self.client, items, edges=True)
# Generated set 1
# 3 items to delete 1 item to merge, 1 item to suggest [+ one possible suggested merge onto the merged one -not yet added to db]
# Generated set 2
# 2 items to delete, 1 item to merge, 1 item to suggest
assert len(items) == 4
# Other checks for generated set 1
changes = list(items)
for change in changes:
if len(change.mergedFrom) == 3: # Merged item
# Check merged item ids
merged_ids = [change.mergedFrom[i].id for i in range(3)]
merged_ids.sort()
assert (merged_ids == ['100001','100002','100003'])
# Check if phone numbers are duplicated
assert len(change.hasPhoneNumber) == 2
# This is most probably not possible (suggesting a merge onto the recently merged one -not yet added to db)
# if len(change.mergedFrom) == 2 and change.email == 'bobgunther@email.com':
# # Check if it is merged on the recently built item
# assert change.mergedFrom[0].id == None
# # Check if lastName is placed
# assert change.lastName == 'Gunther'
if len(change.mergedFrom) == 2 and change.email == 'jane@othermail.com': # Suggested merged item
# Check merge suggestion item ids
merged_ids = [change.mergedFrom[i].id for i in range(2)]
merged_ids.sort()
assert (merged_ids == ['100006','100007'])
# TODO: Well-define expected outcomes
assert len(items) == 12
......@@ -80,6 +80,6 @@ class SimilarityTestCase(TestCase):
score_2 = similarity.calculate_pair_score((self.object_2, self.object_3), weights)
score_3 = similarity.calculate_pair_score((self.object_3, self.object_4), weights)
assert score_1 == 0.4 # 1.0 * 0.4
assert score_2 == 0.05 # one sided empty field score
assert round(100*score_3)/100 == 0.15 # 0.5 * 0.2 + empty field score
\ No newline at end of file
assert round(score_1 * 10000) == round(0.4042 * 10000) # 1.0 * 0.4 + 0.21 * 0.2 * 0.2 (key_1 match + key_2 leaky_relu)
assert round(score_2 * 10000) == round(0.0566 * 10000) # one sided empty field score (0.05) + leaky_relu of key_1, key_2 (0.11 * 0.2 * (0.2+0.4))
assert round(100*score_3)/100 == 0.16 # 0.5 * 0.2 + empty field score (0.05) + leaky_relu of key_1
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment