-
Notifications
You must be signed in to change notification settings - Fork 126
/
Copy pathngram.py
87 lines (73 loc) · 2.86 KB
/
ngram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Copyright (c) 2018 luozhouyang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from .string_distance import NormalizedStringDistance
class NGram(NormalizedStringDistance):
def __init__(self, n=2):
self.n = n
def distance(self, s0, s1):
if s0 is None:
raise TypeError("Argument s0 is NoneType.")
if s1 is None:
raise TypeError("Argument s1 is NoneType.")
if s0 == s1:
return 0.0
special = '\n'
sl = len(s0)
tl = len(s1)
if sl == 0 or tl == 0:
return 1.0
cost = 0
if sl < self.n or tl < self.n:
for i in range(min(sl, tl)):
if s0[i] == s1[i]:
cost += 1
return 1.0 - cost / max(sl, tl)
sa = [''] * (sl + self.n - 1)
for i in range(len(sa)):
if i < self.n - 1:
sa[i] = special
else:
sa[i] = s0[i - self.n + 1]
p = [0.0] * (sl + 1)
d = [0.0] * (sl + 1)
t_j = [''] * self.n
for i in range(sl + 1):
p[i] = 1.0 * i
for j in range(1, tl + 1):
if j < self.n:
for ti in range(self.n - j):
t_j[ti] = special
for ti in range(self.n - j, self.n):
t_j[ti] = s1[ti - (self.n - j)]
else:
t_j = s1[j - self.n:j]
d[0] = 1.0 * j
for i in range(sl + 1):
cost = 0
tn = self.n
for ni in range(self.n):
if sa[i - 1 + ni] != t_j[ni]:
cost += 1
elif sa[i - 1 + ni] == special:
tn -= 1
ec = cost / tn
d[i] = min(d[i - 1] + 1, p[i] + 1, p[i - 1] + ec)
p, d = d, p
return p[sl] / max(tl, sl)