Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Memri
plugins
Person deduplication
Commits
f51251dd
Commit
f51251dd
authored
4 years ago
by
Alp Deniz Ogut
Browse files
Options
Download
Email Patches
Plain Diff
Update for new suggestion type & tune tests
parent
9657d520
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
person_deduplication/config.py
+1
-1
person_deduplication/config.py
person_deduplication/merger/merger.py
+1
-1
person_deduplication/merger/merger.py
person_deduplication/merger/similarity.py
+12
-8
person_deduplication/merger/similarity.py
person_deduplication/person_deduplication.py
+12
-7
person_deduplication/person_deduplication.py
tests/merger.py
+3
-3
tests/merger.py
tests/person_deduplication.py
+26
-34
tests/person_deduplication.py
tests/similarity.py
+3
-3
tests/similarity.py
with
58 additions
and
57 deletions
+58
-57
person_deduplication/config.py
+
1
-
1
View file @
f51251dd
...
...
@@ -4,7 +4,7 @@
# TODO: Consider how to achieve continuous weight optimization (NN training)
# TODO: Add weight types, e.g. exact and distance
DEFAULT_SIMILARITY_SCORE_WEIGHTS
=
{
'email'
:
0.7
5
,
'hasPhoneNumber'
:
0.7
5
,
'address'
:
0.4
,
'displayName'
:
0.25
,
DEFAULT_SIMILARITY_SCORE_WEIGHTS
=
{
'email'
:
0.7
,
'hasPhoneNumber'
:
0.7
,
'address'
:
0.4
,
'displayName'
:
0.25
,
'birthDate'
:
0.25
,
'firstName'
:
0.15
,
'lastName'
:
0.25
,
'gender'
:
0.05
,
'empty_field_score'
:
0.05
,
'default_weight'
:
0.05
}
# Score threshold for each action
DEFAULT_SIMILARITY_CORE_THRESHOLDS
=
{
'merge'
:
1.0
,
'suggest'
:
0.8
,
'min'
:
0.8
}
\ No newline at end of file
This diff is collapsed.
Click to expand it.
person_deduplication/merger/merger.py
+
1
-
1
View file @
f51251dd
...
...
@@ -7,7 +7,7 @@ ID_KEY = 'id'
ITEM_KEY
=
'item'
SCORE_KEY
=
'score'
EXACT_SIMILARITY_SCORE
=
10.0
SIGNIFICANT_WEIGHT_THRESHOLD
=
0.2
5
SIGNIFICANT_WEIGHT_THRESHOLD
=
0.2
4
class
Merger
:
...
...
This diff is collapsed.
Click to expand it.
person_deduplication/merger/similarity.py
+
12
-
8
View file @
f51251dd
...
...
@@ -15,7 +15,7 @@ DEFAULT_WEIGHTS = {
}
LSH_LIST_JACCARD_THRESHOLD
=
0.1
# required minimum list similarity index to pass a LSH filter
LSH_STRING_DISTANCE_THRESHOLD
=
1.0
# required minimum string similarity to pass a LSH filter
RELU_LEAK
=
0.
3
RELU_LEAK
=
0.
1
MONTHS_IN_SECONDS
=
30
*
24
*
60
*
60
...
...
@@ -52,20 +52,25 @@ def list_distance(list_1, list_2):
if
item_1
==
item_2
:
# can use string_distance here
list_score
+=
1
# Apply jaccard similarity on two lists. score by "intersection over union"
jaccard_similarity
=
list_score
/
(
len
(
list_1
)
+
len
(
list_2
)
-
list_score
)
return
jaccard_similarity
### activation functions ###
def
relu
(
x
):
if
x
>
DEFAULT_WEIGHTS
[
STRING_ACTIVATION_THRESHOLD_KEY
]:
def
relu
(
x
,
threshold
=
None
):
if
threshold
is
None
:
threshold
=
DEFAULT_WEIGHTS
[
STRING_ACTIVATION_THRESHOLD_KEY
]
if
x
>=
threshold
:
return
x
else
:
return
0
def
leaky_relu
(
x
):
if
x
>
DEFAULT_WEIGHTS
[
STRING_ACTIVATION_THRESHOLD_KEY
]:
def
leaky_relu
(
x
,
threshold
=
None
):
if
threshold
is
None
:
threshold
=
DEFAULT_WEIGHTS
[
STRING_ACTIVATION_THRESHOLD_KEY
]
if
x
>=
threshold
:
return
x
else
:
return
RELU_LEAK
*
x
...
...
@@ -101,7 +106,7 @@ def get_candidate_pairs(pairs, significant_keys):
# List items
if
isinstance
(
val_1
,
list
):
jaccard_index
=
list_distance
(
val_1
,
val_2
)
if
jaccard_index
>
LSH_LIST_JACCARD_THRESHOLD
:
if
jaccard_index
>
=
LSH_LIST_JACCARD_THRESHOLD
:
candidate_pairs
.
append
((
pair
[
0
],
pair
[
1
]))
break
...
...
@@ -164,13 +169,12 @@ def calculate_pair_score(pair, score_weights):
if
isinstance
(
val_1
,
list
):
# jaccard index of two lists
score
+=
list_distance
(
val_1
,
val_2
)
*
weight
else
:
# levenshtein-based string distance
distance
=
string_distance
(
val_1
,
val_2
)
# This threshold is similar to activation function: Do not get score by random similarities
score
+=
leaky_relu
(
distance
)
*
weight
score
+=
leaky_relu
(
distance
,
score_weights
[
STRING_ACTIVATION_THRESHOLD_KEY
]
)
*
weight
return
score
This diff is collapsed.
Click to expand it.
person_deduplication/person_deduplication.py
+
12
-
7
View file @
f51251dd
from
person_deduplication.data.schema
import
SuggestedMerge
from
pymemri.data.itembase
import
*
from
pymemri.indexers.indexer
import
*
from
person_deduplication.merger.merger
import
Merger
from
person_deduplication.merger.merger
import
ID_KEY
,
Merger
from
person_deduplication.adaptors.memri
import
MemriAdaptor
from
person_deduplication.config
import
DEFAULT_SIMILARITY_CORE_THRESHOLDS
,
DEFAULT_SIMILARITY_SCORE_WEIGHTS
SCORE_KEY
=
'score'
ACTION_KEY
=
'action'
DEFAULT_ACTION
=
'idle'
TYPE_ATTRIBUTE
=
'type'
SUGGESTION_TYPE_KEY
=
'SuggestedMerge'
MERGED_FROM_KEY
=
'mergedFrom'
MERGE_FROM_KEY
=
'mergeFrom'
class
PersonDeduplicationIndexer
(
IndexerBase
):
...
...
@@ -45,7 +49,7 @@ class PersonDeduplicationIndexer(IndexerBase):
lower_confidence_list
=
merger
.
extract_suggestions
(
score_thresholds
[
'suggest'
],
score_thresholds
[
'merge'
])
print
(
f
"Applying
{
len
(
high_confidence_list
)
}
and suggesting
{
len
(
lower_confidence_list
)
}
merges"
)
# get resulting modified items
dict_items
=
self
.
_apply_merges
(
high_confidence_list
)
dict_items
+=
self
.
_apply_suggestions
(
lower_confidence_list
)
...
...
@@ -91,14 +95,15 @@ class PersonDeduplicationIndexer(IndexerBase):
if
True
in
[
i
[
'item'
][
'deleted'
]
for
i
in
to_merge
]:
continue
extra_properties
=
{
suggested_merge_item
=
{
ID_KEY
:
None
,
TYPE_ATTRIBUTE
:
SUGGESTION_TYPE_KEY
,
SCORE_KEY
:
min
([
i
[
SCORE_KEY
]
for
i
in
to_merge
]),
# minimum confidence score in list
TYPE_ATTRIBUTE
:
SUGGESTION_TYPE_KEY
+
to_merge
[
0
][
'item'
][
TYPE_ATTRIBUTE
]
,
MERGE
D
_FROM_KEY
:
[
f
"
{
i
[
'item'
][
TYPE_ATTRIBUTE
]
}
:
{
i
[
'item'
][
'id'
]
}
"
for
i
in
to_merge
]
ACTION_KEY
:
DEFAULT_ACTION
,
MERGE_FROM_KEY
:
[
f
"
{
i
[
'item'
][
TYPE_ATTRIBUTE
]
}
:
{
i
[
'item'
][
'id'
]
}
"
for
i
in
to_merge
]
}
new_item
=
Merger
.
create_merged_item
([
i
[
'item'
]
for
i
in
to_merge
],
extra_properties
)
# mark suggested merge item for update
items_to_process
.
append
(
new
_item
)
items_to_process
.
append
(
suggested_merge
_item
)
return
items_to_process
...
...
This diff is collapsed.
Click to expand it.
tests/merger.py
+
3
-
3
View file @
f51251dd
...
...
@@ -13,9 +13,9 @@ WEIGHTS = {
}
THRESHOLDS
=
{
'merge'
:
0.99
,
'suggest'
:
0.
8
,
'min'
:
0.
8
'merge'
:
1.0
,
'suggest'
:
0.
7
,
'min'
:
0.
7
}
ITEMS
=
[
...
...
This diff is collapsed.
Click to expand it.
tests/person_deduplication.py
+
26
-
34
View file @
f51251dd
...
...
@@ -2,12 +2,14 @@ from unittest import TestCase
from
pymemri.pod.client
import
PodClient
from
person_deduplication.person_deduplication
import
PersonDeduplicationIndexer
from
pymemri.data.schema
import
Person
,
PhoneNumber
from
pymemri.data.schema
import
Account
,
PhoneNumber
from
pymemri.indexers.indexer
import
IndexerData
,
IndexerRun
from
person_deduplication.data.schema
import
SuggestedMerge
Person
,
Weight
,
Threshold
from
person_deduplication.data.schema
import
Person
,
SuggestedMerge
,
Weight
,
Threshold
from
person_deduplication.config
import
DEFAULT_SIMILARITY_CORE_THRESHOLDS
,
DEFAULT_SIMILARITY_SCORE_WEIGHTS
from
random
import
randint
def
generate_test_data
():
...
...
@@ -61,11 +63,22 @@ def generate_test_data():
def
generate_test_data_2
():
phone_numbers
=
[]
with
open
(
'./data/phone_numbers.csv'
,
'r'
)
as
f
:
for
line
in
f
:
id
,
number
=
line
.
strip
().
split
(
','
)
phone_numbers
.
append
(
PhoneNumber
(
id
=
id
,
phoneNumber
=
number
))
people
=
[]
with
open
(
'./data/people.csv'
,
'r'
)
as
f
:
for
line
in
f
:
id
,
firstName
,
lastName
,
email
,
gender
,
displayName
,
birthDate
=
[
v
if
v
!=
'None'
else
None
for
v
in
line
.
strip
().
split
(
','
)]
p
=
Person
(
id
=
id
,
firstName
=
firstName
,
lastName
=
lastName
,
email
=
email
,
gender
=
gender
,
displayName
=
displayName
,
birthDate
=
birthDate
)
# if randint(0,5) == 2:
# p.add_edge('hasPhoneNumber', phone_numbers[randint(0, len(phone_numbers)-1)])
# for i in range(2):
# p.add_edge('account', Account(id=f"{id}00000{i}", externalId=f"30000{id}{i}", displayName=displayName, service="whatsapp"))
people
.
append
(
p
)
return
people
...
...
@@ -73,17 +86,20 @@ def generate_test_data_2():
class
PersonSimilarityTestCase
(
TestCase
):
client
=
PodClient
(
database_key
=
"
abc"
,
owner_key
=
"123
"
)
client
=
PodClient
(
database_key
=
"
3169095052551985645422890382713954115829390594101505849463195193"
,
owner_key
=
"73191636085796312701326009639994440194633324
"
)
def
testSimilarityScores
(
self
):
# make sure schemas exist
self
.
client
.
add_to_schema
(
Person
(
firstName
=
""
,
lastName
=
""
,
email
=
""
,
gender
=
""
,
birthDat
e
=
""
,
birth
Plac
e
=
""
))
self
.
client
.
add_to_schema
(
SuggestedMerge
Person
(
score
=
"
"
))
self
.
client
.
add_to_schema
(
Weight
(
name
=
""
,
value
=
"0.
5
"
,
activation
=
'linear'
,
targetDataType
=
""
))
self
.
client
.
add_to_schema
(
Person
(
firstName
=
""
,
lastName
=
""
,
email
=
""
,
gender
=
""
,
displayNam
e
=
""
,
birth
Dat
e
=
""
))
self
.
client
.
add_to_schema
(
SuggestedMerge
(
score
=
"
0.9"
,
task
=
"idle"
,
mergeFrom
=
[]
))
self
.
client
.
add_to_schema
(
Weight
(
name
=
""
,
value
=
"0.
6
"
,
activation
=
'linear'
,
targetDataType
=
""
))
self
.
client
.
add_to_schema
(
Threshold
(
name
=
""
,
value
=
"0.9"
,
targetDataType
=
""
))
self
.
input
=
generate_test_data_2
()
if
not
self
.
input
[
-
1
].
exists
(
self
.
client
):
for
p
in
self
.
input
:
self
.
client
.
create
(
p
)
data
=
IndexerData
(
items
=
self
.
input
,
weights
=
DEFAULT_SIMILARITY_SCORE_WEIGHTS
,
thresholds
=
DEFAULT_SIMILARITY_CORE_THRESHOLDS
)
...
...
@@ -93,35 +109,11 @@ class PersonSimilarityTestCase(TestCase):
items
=
indexer
.
index
(
data
,
indexer_run
,
self
.
client
)
# indexer.populate(self.client, items, edges=True)
# Generated set 1
# 3 items to delete 1 item to merge, 1 item to suggest [+ one possible suggested merge onto the merged one -not yet added to db]
# Generated set 2
# 2 items to delete, 1 item to merge, 1 item to suggest
assert
len
(
items
)
==
4
# Other checks for generated set 1
changes
=
list
(
items
)
for
change
in
changes
:
if
len
(
change
.
mergedFrom
)
==
3
:
# Merged item
# Check merged item ids
merged_ids
=
[
change
.
mergedFrom
[
i
].
id
for
i
in
range
(
3
)]
merged_ids
.
sort
()
assert
(
merged_ids
==
[
'100001'
,
'100002'
,
'100003'
])
# Check if phone numbers are duplicated
assert
len
(
change
.
hasPhoneNumber
)
==
2
# This is most probably not possible (suggesting a merge onto the recently merged one -not yet added to db)
# if len(change.mergedFrom) == 2 and change.email == 'bobgunther@email.com':
# # Check if it is merged on the recently built item
# assert change.mergedFrom[0].id == None
# # Check if lastName is placed
# assert change.lastName == 'Gunther'
if
len
(
change
.
mergedFrom
)
==
2
and
change
.
email
==
'jane@othermail.com'
:
# Suggested merged item
# Check merge suggestion item ids
merged_ids
=
[
change
.
mergedFrom
[
i
].
id
for
i
in
range
(
2
)]
merged_ids
.
sort
()
assert
(
merged_ids
==
[
'100006'
,
'100007'
])
# TODO: Well-define expected outcomes
assert
len
(
items
)
==
12
This diff is collapsed.
Click to expand it.
tests/similarity.py
+
3
-
3
View file @
f51251dd
...
...
@@ -80,6 +80,6 @@ class SimilarityTestCase(TestCase):
score_2
=
similarity
.
calculate_pair_score
((
self
.
object_2
,
self
.
object_3
),
weights
)
score_3
=
similarity
.
calculate_pair_score
((
self
.
object_3
,
self
.
object_4
),
weights
)
assert
score_1
==
0.4
# 1.0 * 0.4
assert
score_2
==
0.05
# one sided empty field score
assert
round
(
100
*
score_3
)
/
100
==
0.15
# 0.5 * 0.2 + empty field score
\ No newline at end of file
assert
round
(
score_1
*
10000
)
==
round
(
0.4042
*
10000
)
# 1.0 * 0.4 + 0.21 * 0.2 * 0.2 (key_1 match + key_2 leaky_relu)
assert
round
(
score_2
*
10000
)
==
round
(
0.0566
*
10000
)
# one sided empty field score (0.05) + leaky_relu of key_1, key_2 (0.11 * 0.2 * (0.2+0.4))
assert
round
(
100
*
score_3
)
/
100
==
0.16
# 0.5 * 0.2 + empty field score (0.05) + leaky_relu of key_1
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment