Skip to content

Commit 298032e

Browse files
committed
rename badwords to flagged words
1 parent f3ca4a8 commit 298032e

8 files changed

Lines changed: 151 additions & 151 deletions

ac_dc/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ The supported languages are defined in the file [languages_id.py](https://github
1313

1414
Take a look at the PDF [explanation_filtering_pipeline.pdf](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/explanation_filtering_pipeline.pdf) for an explanation of the filtering pipeline.
1515

16-
#### 1. Define the lists of stop words and bad words, and check how the anonymization and the normalization of texts are done
16+
#### 1. Define the lists of stop words and flagged words, and check how the anonymization and the normalization of texts are done
1717

18-
You might want to redefine the lists of stop words and bad words for robustness or ethical reasons in the files [stopwords.py](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/stopwords.py) and [badwords.py](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/badwords.py).
18+
You might want to redefine the lists of stop words and flagged words for robustness or ethical reasons in the files [stopwords.py](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/stopwords.py) and [flagged_words.py](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/flagged_words.py).
1919

2020
Less importantly, you can also check how the anonymization and the normalization of texts are done in the files [anonymization.py](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/anonymization.py) and [normalization.py](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/normalization.py) (if applicable; by default, anonymization is applied and normalization is not).
2121

-241 Bytes
Binary file not shown.

ac_dc/filtering.py

Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from parameters_filtering import parameters_filtering
1414
from normalization import normalization
1515
from stopwords import stopwords
16-
from badwords import badwords
16+
from flagged_words import flagged_words
1717

1818

1919
class LoadParameters:
@@ -37,15 +37,15 @@ def load_stopwords(lang_dataset_id):
3737
return stopwords_lang
3838

3939
@staticmethod
40-
def load_badwords(lang_dataset_id):
41-
badwords_lang_id = langs_id.loc[
42-
langs_id["dataset_id"] == lang_dataset_id, "badwords_id"
40+
def load_flagged_words(lang_dataset_id):
41+
flagged_words_lang_id = langs_id.loc[
42+
langs_id["dataset_id"] == lang_dataset_id, "flagged_words_id"
4343
].iloc[0]
44-
if badwords_lang_id:
45-
badwords_lang = set(badwords[badwords_lang_id])
44+
if flagged_words_lang_id:
45+
flagged_words_lang = set(flagged_words[flagged_words_lang_id])
4646
else:
47-
badwords_lang = None
48-
return badwords_lang
47+
flagged_words_lang = None
48+
return flagged_words_lang
4949

5050
@staticmethod
5151
def load_model_lang_id(lang_dataset_id, path_fasttext_model):
@@ -533,14 +533,14 @@ def check_stopwords(
533533
return cond
534534

535535
@staticmethod
536-
def compute_badwords_ratio(
536+
def compute_flagged_words_ratio(
537537
document,
538538
sentencepiece_model_tok,
539539
strip_characters,
540540
cond_words_augmentation,
541541
words_augmentation_group_sizes,
542542
words_augmentation_join_char,
543-
badwords,
543+
flagged_words,
544544
):
545545
words = ModifyingDocuments.get_words_from_document(
546546
document,
@@ -559,36 +559,36 @@ def compute_badwords_ratio(
559559
for group_size in words_augmentation_group_sizes
560560
]
561561
augmentation = [word for augm in augmentation for word in augm]
562-
badwords_ratio = len(
563-
[word for word in words + augmentation if word in badwords]
562+
flagged_words_ratio = len(
563+
[word for word in words + augmentation if word in flagged_words]
564564
) / len(words)
565-
if badwords_ratio > 1.0:
566-
badwords_ratio = 1.0
567-
return badwords_ratio
565+
if flagged_words_ratio > 1.0:
566+
flagged_words_ratio = 1.0
567+
return flagged_words_ratio
568568

569569
@staticmethod
570-
def check_badwords(
570+
def check_flagged_words(
571571
document,
572572
sentencepiece_model_tok,
573573
strip_characters,
574574
cond_words_augmentation,
575575
words_augmentation_group_sizes,
576576
words_augmentation_join_char,
577-
badwords,
578-
badwords_max_cutoff,
577+
flagged_words,
578+
flagged_words_max_cutoff,
579579
):
580580
cond = True
581-
if badwords:
582-
badwords_ratio = Filtering.compute_badwords_ratio(
581+
if flagged_words:
582+
flagged_words_ratio = Filtering.compute_flagged_words_ratio(
583583
document,
584584
sentencepiece_model_tok,
585585
strip_characters,
586586
cond_words_augmentation,
587587
words_augmentation_group_sizes,
588588
words_augmentation_join_char,
589-
badwords,
589+
flagged_words,
590590
)
591-
cond = badwords_ratio <= badwords_max_cutoff
591+
cond = flagged_words_ratio <= flagged_words_max_cutoff
592592
return cond
593593

594594
@staticmethod
@@ -682,9 +682,9 @@ def filtering(
682682
cond_check_stopwords,
683683
stopwords,
684684
stopwords_min_cutoff,
685-
cond_check_badwords,
686-
badwords,
687-
badwords_max_cutoff,
685+
cond_check_flagged_words,
686+
flagged_words,
687+
flagged_words_max_cutoff,
688688
cond_check_lang_id,
689689
lang_dataset_id,
690690
model_lang_id,
@@ -729,16 +729,16 @@ def filtering(
729729
stopwords_min_cutoff,
730730
):
731731
return False
732-
if cond_check_badwords:
733-
if not Filtering.check_badwords(
732+
if cond_check_flagged_words:
733+
if not Filtering.check_flagged_words(
734734
document,
735735
sentencepiece_model_tok,
736736
strip_characters,
737737
cond_words_augmentation,
738738
words_augmentation_group_sizes,
739739
words_augmentation_join_char,
740-
badwords,
741-
badwords_max_cutoff,
740+
flagged_words,
741+
flagged_words_max_cutoff,
742742
):
743743
return False
744744
if cond_check_lang_id:
@@ -775,7 +775,7 @@ def __init__(
775775

776776
self.param = LoadParameters.load_parameters(lang_dataset_id)
777777
self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
778-
self.badwords = LoadParameters.load_badwords(lang_dataset_id)
778+
self.flagged_words = LoadParameters.load_flagged_words(lang_dataset_id)
779779
self.model_lang_id = LoadParameters.load_model_lang_id(
780780
lang_dataset_id, path_fasttext_model
781781
)
@@ -809,9 +809,9 @@ def __call__(self, example):
809809
cond_check_stopwords=self.param["cond_check_stopwords"],
810810
stopwords=self.stopwords,
811811
stopwords_min_cutoff=self.param["stopwords_min_cutoff"],
812-
cond_check_badwords=self.param["cond_check_badwords"],
813-
badwords=self.badwords,
814-
badwords_max_cutoff=self.param["badwords_max_cutoff"],
812+
cond_check_flagged_words=self.param["cond_check_flagged_words"],
813+
flagged_words=self.flagged_words,
814+
flagged_words_max_cutoff=self.param["flagged_words_max_cutoff"],
815815
cond_check_lang_id=self.param["cond_check_lang_id"],
816816
lang_dataset_id=self.lang_dataset_id,
817817
model_lang_id=self.model_lang_id,
Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
77

88

9-
english_badwords = [
9+
english_flagged_words = [
1010
"adult",
1111
"amateur",
1212
"anal",
@@ -229,8 +229,8 @@
229229
]
230230

231231

232-
badwords = {
233-
"ar": english_badwords
232+
flagged_words = {
233+
"ar": english_flagged_words
234234
+ [
235235
"احتلام",
236236
"اغتصاب",
@@ -271,7 +271,7 @@
271271
"مفلقسة",
272272
"نيك",
273273
],
274-
"ca": english_badwords
274+
"ca": english_flagged_words
275275
+ [
276276
"anal",
277277
"anus",
@@ -405,8 +405,8 @@
405405
"x classificat",
406406
"xxx",
407407
],
408-
"en": english_badwords,
409-
"es": english_badwords
408+
"en": english_flagged_words,
409+
"es": english_flagged_words
410410
+ [
411411
"Asesinato",
412412
"Bollera",
@@ -598,7 +598,7 @@
598598
"xxx",
599599
"zurullo",
600600
],
601-
"eu": english_badwords
601+
"eu": english_flagged_words
602602
+ [
603603
"abortu",
604604
"anal",
@@ -639,7 +639,7 @@
639639
"bagina",
640640
"viagra",
641641
],
642-
"fr": english_badwords
642+
"fr": english_flagged_words
643643
+ [
644644
"MALPT",
645645
"anal",
@@ -840,7 +840,7 @@
840840
"éjaculé",
841841
"étron",
842842
],
843-
"hi": english_badwords
843+
"hi": english_flagged_words
844844
+ [
845845
"aand",
846846
"aandu",
@@ -1098,7 +1098,7 @@
10981098
"होमोसेक्सुअल",
10991099
"होर",
11001100
],
1101-
"id": english_badwords
1101+
"id": english_flagged_words
11021102
+ [
11031103
"abortus",
11041104
"anal",
@@ -1216,7 +1216,7 @@
12161216
"x diberi peringkat",
12171217
"xxx",
12181218
],
1219-
"kn": english_badwords
1219+
"kn": english_flagged_words
12201220
+ [
12211221
"ಗರ್ಭಪಾತ",
12221222
"ಗುದ",
@@ -1359,7 +1359,7 @@
13591359
"x ರೇಟೆಡ್",
13601360
"xxx",
13611361
],
1362-
"ml": english_badwords
1362+
"ml": english_flagged_words
13631363
+ [
13641364
"ഗർഭഛിദ്രം",
13651365
"വിശപ്പ്",
@@ -1496,7 +1496,7 @@
14961496
"x റേറ്റുചെയ്തു",
14971497
"xxx",
14981498
],
1499-
"mr": english_badwords
1499+
"mr": english_flagged_words
15001500
+ [
15011501
"गर्भपात",
15021502
"गुदा",
@@ -1635,7 +1635,7 @@
16351635
"एक्स रेट केले",
16361636
"xxx",
16371637
],
1638-
"pt": english_badwords
1638+
"pt": english_flagged_words
16391639
+ [
16401640
"aborto",
16411641
"amador",
@@ -1833,7 +1833,7 @@
18331833
"xxx",
18341834
"ânus",
18351835
],
1836-
"te": english_badwords
1836+
"te": english_flagged_words
18371837
+ [
18381838
"గర్భస్రావం",
18391839
"అంగ",
@@ -1980,7 +1980,7 @@
19801980
"x రేట్",
19811981
"xxx",
19821982
],
1983-
"vi": english_badwords
1983+
"vi": english_flagged_words
19841984
+ [
19851985
"sự phá thai",
19861986
"hậu môn",
@@ -2098,7 +2098,7 @@
20982098
"x đánh giá",
20992099
"xxx",
21002100
],
2101-
"zh": english_badwords
2101+
"zh": english_flagged_words
21022102
+ [
21032103
"13.",
21042104
"13点",

0 commit comments

Comments
 (0)