Skip to content

Commit 9a10c9b

Browse files
committed
filter for repetition removal
1 parent 6e0261d commit 9a10c9b

2 files changed

Lines changed: 124 additions & 0 deletions

File tree

ac_dc/filtering.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
from os import stat
12
import re
23

4+
import numpy as np
5+
36
import fasttext
47

58
import sentencepiece
@@ -420,6 +423,36 @@ def check_number_words(
420423
)
421424
return cond
422425

426+
@staticmethod
427+
def compute_repetitions_ratio(document, repetitions_length):
428+
def get_freq_ngrams(document, n):
429+
ngrams = [document[i : i + n] for i in range(len(document) - n + 1)]
430+
freq_ngrams = {}
431+
for ngram in ngrams:
432+
freq_ngrams[ngram] = freq_ngrams.get(ngram, 0) + 1
433+
return freq_ngrams
434+
435+
freq_ngrams = get_freq_ngrams(document, repetitions_length)
436+
if len(freq_ngrams) == 0:
437+
return 0
438+
freq_ngrams = list(freq_ngrams.values())
439+
freq_ngrams = sorted(freq_ngrams, reverse=True)
440+
num_rep_ngrams = int(np.sqrt(len(freq_ngrams)))
441+
repetitions_ratio = sum(freq_ngrams[:num_rep_ngrams]) / sum(freq_ngrams)
442+
return repetitions_ratio
443+
444+
@staticmethod
def check_repetitions_removal(
    document,
    repetitions_length,
    repetitions_max_cutoff,
):
    """Tell whether a document passes the repetition filter.

    Args:
        document: Text forwarded to ``Filtering.compute_repetitions_ratio``.
        repetitions_length: n-gram length forwarded to the same helper.
        repetitions_max_cutoff: Highest repetitions ratio still accepted.

    Returns:
        ``True`` when the computed ratio is less than or equal to
        ``repetitions_max_cutoff``, ``False`` otherwise.
    """
    ratio = Filtering.compute_repetitions_ratio(document, repetitions_length)
    return ratio <= repetitions_max_cutoff
455+
423456
@staticmethod
424457
def compute_special_characters_ratio(document, special_characters):
425458
special_characters_ratio = len(
@@ -639,6 +672,9 @@ def filtering(
639672
strip_characters,
640673
number_words_min_cutoff,
641674
number_words_max_cutoff,
675+
cond_check_repetitions_removal,
676+
repetitions_length,
677+
repetitions_max_cutoff,
642678
cond_check_special_characters,
643679
special_characters,
644680
special_characters_max_cutoff,
@@ -669,6 +705,13 @@ def filtering(
669705
number_words_max_cutoff,
670706
):
671707
return False
708+
if cond_check_repetitions_removal:
709+
if not Filtering.check_repetitions_removal(
710+
document,
711+
repetitions_length,
712+
repetitions_max_cutoff,
713+
):
714+
return False
672715
if cond_check_special_characters:
673716
if not Filtering.check_special_characters(
674717
document,
@@ -756,6 +799,9 @@ def __call__(self, example):
756799
strip_characters=self.param["strip_characters"],
757800
number_words_min_cutoff=self.param["number_words_min_cutoff"],
758801
number_words_max_cutoff=self.param["number_words_max_cutoff"],
802+
cond_check_repetitions_removal=self.param["check_repetitions_removal"],
803+
repetitions_length=self.param["repetitions_length"],
804+
repetitions_max_cutoff=self.param["repetitions_max_cutoff"],
759805
cond_check_special_characters=self.param["cond_check_special_characters"],
760806
special_characters=self.param["special_characters"],
761807
special_characters_max_cutoff=self.param["special_characters_max_cutoff"],

ac_dc/parameters_filtering.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@
2828
"strip_characters": special_characters_default,
2929
"number_words_min_cutoff": 1,
3030
"number_words_max_cutoff": 100000,
31+
"check_repetitions_removal": True,
32+
"repetitions_length": 10,
33+
"repetitions_max_cutoff": 0.106,
3134
"cond_check_special_characters": True,
3235
"special_characters": special_characters_default,
3336
"special_characters_max_cutoff": 0.4,
@@ -56,6 +59,9 @@
5659
"strip_characters": special_characters_default,
5760
"number_words_min_cutoff": 1,
5861
"number_words_max_cutoff": 100000,
62+
"check_repetitions_removal": True,
63+
"repetitions_length": 10,
64+
"repetitions_max_cutoff": 0.106,
5965
"cond_check_special_characters": True,
6066
"special_characters": special_characters_default,
6167
"special_characters_max_cutoff": 0.3,
@@ -84,6 +90,9 @@
8490
"strip_characters": special_characters_default,
8591
"number_words_min_cutoff": 1,
8692
"number_words_max_cutoff": 100000,
93+
"check_repetitions_removal": True,
94+
"repetitions_length": 10,
95+
"repetitions_max_cutoff": 0.106,
8796
"cond_check_special_characters": True,
8897
"special_characters": special_characters_default,
8998
"special_characters_max_cutoff": 0.45,
@@ -112,6 +121,9 @@
112121
"strip_characters": special_characters_default,
113122
"number_words_min_cutoff": 1,
114123
"number_words_max_cutoff": 100000,
124+
"check_repetitions_removal": True,
125+
"repetitions_length": 10,
126+
"repetitions_max_cutoff": 0.106,
115127
"cond_check_special_characters": True,
116128
"special_characters": special_characters_default,
117129
"special_characters_max_cutoff": 0.5,
@@ -140,6 +152,9 @@
140152
"strip_characters": special_characters_default,
141153
"number_words_min_cutoff": 1,
142154
"number_words_max_cutoff": 100000,
155+
"check_repetitions_removal": True,
156+
"repetitions_length": 10,
157+
"repetitions_max_cutoff": 0.106,
143158
"cond_check_special_characters": True,
144159
"special_characters": special_characters_default,
145160
"special_characters_max_cutoff": 0.25,
@@ -168,6 +183,9 @@
168183
"strip_characters": special_characters_default,
169184
"number_words_min_cutoff": 1,
170185
"number_words_max_cutoff": 100000,
186+
"check_repetitions_removal": True,
187+
"repetitions_length": 10,
188+
"repetitions_max_cutoff": 0.106,
171189
"cond_check_special_characters": True,
172190
"special_characters": special_characters_default,
173191
"special_characters_max_cutoff": 0.275,
@@ -196,6 +214,9 @@
196214
"strip_characters": special_characters_default,
197215
"number_words_min_cutoff": 1,
198216
"number_words_max_cutoff": 100000,
217+
"check_repetitions_removal": True,
218+
"repetitions_length": 10,
219+
"repetitions_max_cutoff": 0.106,
199220
"cond_check_special_characters": True,
200221
"special_characters": special_characters_default,
201222
"special_characters_max_cutoff": 0.35,
@@ -224,6 +245,9 @@
224245
"strip_characters": special_characters_default,
225246
"number_words_min_cutoff": 20,
226247
"number_words_max_cutoff": 100000,
248+
"check_repetitions_removal": True,
249+
"repetitions_length": 10,
250+
"repetitions_max_cutoff": 0.106,
227251
"cond_check_special_characters": True,
228252
"special_characters": special_characters_default,
229253
"special_characters_max_cutoff": 0.4,
@@ -252,6 +276,9 @@
252276
"strip_characters": special_characters_default,
253277
"number_words_min_cutoff": 1,
254278
"number_words_max_cutoff": 100000,
279+
"check_repetitions_removal": True,
280+
"repetitions_length": 10,
281+
"repetitions_max_cutoff": 0.106,
255282
"cond_check_special_characters": True,
256283
"special_characters": special_characters_default,
257284
"special_characters_max_cutoff": 0.3,
@@ -280,6 +307,9 @@
280307
"strip_characters": special_characters_default,
281308
"number_words_min_cutoff": 1,
282309
"number_words_max_cutoff": 100000,
310+
"check_repetitions_removal": True,
311+
"repetitions_length": 10,
312+
"repetitions_max_cutoff": 0.106,
283313
"cond_check_special_characters": True,
284314
"special_characters": special_characters_default,
285315
"special_characters_max_cutoff": 0.3,
@@ -308,6 +338,9 @@
308338
"strip_characters": special_characters_default,
309339
"number_words_min_cutoff": 1,
310340
"number_words_max_cutoff": 100000,
341+
"check_repetitions_removal": True,
342+
"repetitions_length": 10,
343+
"repetitions_max_cutoff": 0.106,
311344
"cond_check_special_characters": True,
312345
"special_characters": special_characters_default,
313346
"special_characters_max_cutoff": 0.35,
@@ -336,6 +369,9 @@
336369
"strip_characters": special_characters_default,
337370
"number_words_min_cutoff": 1,
338371
"number_words_max_cutoff": 100000,
372+
"check_repetitions_removal": True,
373+
"repetitions_length": 10,
374+
"repetitions_max_cutoff": 0.106,
339375
"cond_check_special_characters": True,
340376
"special_characters": special_characters_default,
341377
"special_characters_max_cutoff": 0.3,
@@ -364,6 +400,9 @@
364400
"strip_characters": special_characters_default,
365401
"number_words_min_cutoff": 1,
366402
"number_words_max_cutoff": 100000,
403+
"check_repetitions_removal": True,
404+
"repetitions_length": 10,
405+
"repetitions_max_cutoff": 0.106,
367406
"cond_check_special_characters": True,
368407
"special_characters": special_characters_default,
369408
"special_characters_max_cutoff": 0.35,
@@ -392,6 +431,9 @@
392431
"strip_characters": special_characters_default,
393432
"number_words_min_cutoff": 1,
394433
"number_words_max_cutoff": 100000,
434+
"check_repetitions_removal": True,
435+
"repetitions_length": 10,
436+
"repetitions_max_cutoff": 0.106,
395437
"cond_check_special_characters": True,
396438
"special_characters": special_characters_default,
397439
"special_characters_max_cutoff": 0.25,
@@ -420,6 +462,9 @@
420462
"strip_characters": special_characters_default,
421463
"number_words_min_cutoff": 1,
422464
"number_words_max_cutoff": 100000,
465+
"check_repetitions_removal": True,
466+
"repetitions_length": 10,
467+
"repetitions_max_cutoff": 0.106,
423468
"cond_check_special_characters": True,
424469
"special_characters": special_characters_default,
425470
"special_characters_max_cutoff": 0.25,
@@ -448,6 +493,9 @@
448493
"strip_characters": special_characters_default,
449494
"number_words_min_cutoff": 1,
450495
"number_words_max_cutoff": 100000,
496+
"check_repetitions_removal": True,
497+
"repetitions_length": 10,
498+
"repetitions_max_cutoff": 0.106,
451499
"cond_check_special_characters": True,
452500
"special_characters": special_characters_default,
453501
"special_characters_max_cutoff": 0.2,
@@ -476,6 +524,9 @@
476524
"strip_characters": special_characters_default,
477525
"number_words_min_cutoff": 1,
478526
"number_words_max_cutoff": 100000,
527+
"check_repetitions_removal": True,
528+
"repetitions_length": 10,
529+
"repetitions_max_cutoff": 0.106,
479530
"cond_check_special_characters": True,
480531
"special_characters": special_characters_default,
481532
"special_characters_max_cutoff": 0.25,
@@ -504,6 +555,9 @@
504555
"strip_characters": special_characters_default,
505556
"number_words_min_cutoff": 1,
506557
"number_words_max_cutoff": 100000,
558+
"check_repetitions_removal": True,
559+
"repetitions_length": 10,
560+
"repetitions_max_cutoff": 0.106,
507561
"cond_check_special_characters": True,
508562
"special_characters": special_characters_default,
509563
"special_characters_max_cutoff": 0.3,
@@ -532,6 +586,9 @@
532586
"strip_characters": special_characters_default,
533587
"number_words_min_cutoff": 1,
534588
"number_words_max_cutoff": 100000,
589+
"check_repetitions_removal": True,
590+
"repetitions_length": 10,
591+
"repetitions_max_cutoff": 0.106,
535592
"cond_check_special_characters": True,
536593
"special_characters": special_characters_default,
537594
"special_characters_max_cutoff": 0.3,
@@ -560,6 +617,9 @@
560617
"strip_characters": special_characters_default,
561618
"number_words_min_cutoff": 1,
562619
"number_words_max_cutoff": 100000,
620+
"check_repetitions_removal": True,
621+
"repetitions_length": 10,
622+
"repetitions_max_cutoff": 0.106,
563623
"cond_check_special_characters": True,
564624
"special_characters": special_characters_default,
565625
"special_characters_max_cutoff": 0.275,
@@ -588,6 +648,9 @@
588648
"strip_characters": special_characters_default,
589649
"number_words_min_cutoff": 1,
590650
"number_words_max_cutoff": 100000,
651+
"check_repetitions_removal": True,
652+
"repetitions_length": 10,
653+
"repetitions_max_cutoff": 0.106,
591654
"cond_check_special_characters": True,
592655
"special_characters": special_characters_default,
593656
"special_characters_max_cutoff": 0.25,
@@ -616,6 +679,9 @@
616679
"strip_characters": special_characters_default,
617680
"number_words_min_cutoff": 1,
618681
"number_words_max_cutoff": 100000,
682+
"check_repetitions_removal": True,
683+
"repetitions_length": 10,
684+
"repetitions_max_cutoff": 0.106,
619685
"cond_check_special_characters": True,
620686
"special_characters": special_characters_default,
621687
"special_characters_max_cutoff": 0.25,
@@ -644,6 +710,9 @@
644710
"strip_characters": special_characters_default,
645711
"number_words_min_cutoff": 1,
646712
"number_words_max_cutoff": 100000,
713+
"check_repetitions_removal": True,
714+
"repetitions_length": 10,
715+
"repetitions_max_cutoff": 0.106,
647716
"cond_check_special_characters": True,
648717
"special_characters": special_characters_default,
649718
"special_characters_max_cutoff": 0.4,
@@ -672,6 +741,9 @@
672741
"strip_characters": special_characters_default,
673742
"number_words_min_cutoff": 1,
674743
"number_words_max_cutoff": 100000,
744+
"check_repetitions_removal": True,
745+
"repetitions_length": 10,
746+
"repetitions_max_cutoff": 0.106,
675747
"cond_check_special_characters": True,
676748
"special_characters": special_characters_default,
677749
"special_characters_max_cutoff": 0.35,
@@ -700,6 +772,9 @@
700772
"strip_characters": special_characters_default,
701773
"number_words_min_cutoff": 1,
702774
"number_words_max_cutoff": 100000,
775+
"check_repetitions_removal": True,
776+
"repetitions_length": 10,
777+
"repetitions_max_cutoff": 0.106,
703778
"cond_check_special_characters": True,
704779
"special_characters": special_characters_default,
705780
"special_characters_max_cutoff": 0.3,
@@ -728,6 +803,9 @@
728803
"strip_characters": special_characters_default,
729804
"number_words_min_cutoff": 1,
730805
"number_words_max_cutoff": 100000,
806+
"check_repetitions_removal": True,
807+
"repetitions_length": 10,
808+
"repetitions_max_cutoff": 0.106,
731809
"cond_check_special_characters": True,
732810
"special_characters": special_characters_default,
733811
"special_characters_max_cutoff": 0.4,

0 commit comments

Comments
 (0)