bigscience-workshop
diff --git a/ac_dc/README.md b/ac_dc/README.md
Lines changed: 2 additions & 2 deletions
diff --git a/ac_dc/explanation_filtering_pipeline.pdf b/ac_dc/explanation_filtering_pipeline.pdf
Binary file changed: -241 Bytes
diff --git a/ac_dc/filtering.py b/ac_dc/filtering.py
Lines changed: 33 additions & 33 deletions
diff --git a/ac_dc/badwords.py b/ac_dc/flagged_words.py
rename from ac_dc/badwords.py
rename to ac_dc/flagged_words.py
Lines changed: 17 additions & 17 deletions
@@ -13,9 +13,9 @@ The supported languages are defined in the file [languages_id.py](https://github
 
 Take a look at the pdf [explanation_filtering_pipeline.pdf](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/explanation_filtering_pipeline.pdf) for an explanation of the filtering pipeline.
 
-#### 1. Define the lists of stop words and bad words, and check how the anonymization and the normalization of texts are done
+#### 1. Define the lists of stop words and flagged words, and check how the anonymization and the normalization of texts are done
 
-You might want to redefine the lists of stop words and bad words for robustness or ethical reasons in the files [stopwords.py](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/stopwords.py) and [badwords.py](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/badwords.py).
+You might want to redefine the lists of stop words and flagged words for robustness or ethical reasons in the files [stopwords.py](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/stopwords.py) and [flagged_words.py](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/flagged_words.py).
 
 Less importantly, you can also check how the anonymization and the normalization of texts are done in the files [anonymization.py](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/anonymization.py) and [normalization.py](https://github.com/bigscience-workshop/data_tooling/blob/master/ac_dc/normalization.py) (if applicable, default is to use the anonymization and not to use the normalization).
 
 
@@ -13,7 +13,7 @@
 from parameters_filtering import parameters_filtering
 from normalization import normalization
 from stopwords import stopwords
-from badwords import badwords
+from flagged_words import flagged_words
 
 
 class LoadParameters:
@@ -37,15 +37,15 @@ def load_stopwords(lang_dataset_id):
         return stopwords_lang
 
     @staticmethod
-    def load_badwords(lang_dataset_id):
-        badwords_lang_id = langs_id.loc[
-            langs_id["dataset_id"] == lang_dataset_id, "badwords_id"
+    def load_flagged_words(lang_dataset_id):
+        flagged_words_lang_id = langs_id.loc[
+            langs_id["dataset_id"] == lang_dataset_id, "flagged_words_id"
         ].iloc[0]
-        if badwords_lang_id:
-            badwords_lang = set(badwords[badwords_lang_id])
+        if flagged_words_lang_id:
+            flagged_words_lang = set(flagged_words[flagged_words_lang_id])
         else:
-            badwords_lang = None
-        return badwords_lang
+            flagged_words_lang = None
+        return flagged_words_lang
 
     @staticmethod
     def load_model_lang_id(lang_dataset_id, path_fasttext_model):
@@ -533,14 +533,14 @@ def check_stopwords(
         return cond
 
     @staticmethod
-    def compute_badwords_ratio(
+    def compute_flagged_words_ratio(
         document,
         sentencepiece_model_tok,
         strip_characters,
         cond_words_augmentation,
         words_augmentation_group_sizes,
         words_augmentation_join_char,
-        badwords,
+        flagged_words,
     ):
         words = ModifyingDocuments.get_words_from_document(
             document,
@@ -559,36 +559,36 @@ def compute_badwords_ratio(
                 for group_size in words_augmentation_group_sizes
             ]
             augmentation = [word for augm in augmentation for word in augm]
-        badwords_ratio = len(
-            [word for word in words + augmentation if word in badwords]
+        flagged_words_ratio = len(
+            [word for word in words + augmentation if word in flagged_words]
         ) / len(words)
-        if badwords_ratio > 1.0:
-            badwords_ratio = 1.0
-        return badwords_ratio
+        if flagged_words_ratio > 1.0:
+            flagged_words_ratio = 1.0
+        return flagged_words_ratio
 
     @staticmethod
-    def check_badwords(
+    def check_flagged_words(
         document,
         sentencepiece_model_tok,
         strip_characters,
         cond_words_augmentation,
         words_augmentation_group_sizes,
         words_augmentation_join_char,
-        badwords,
-        badwords_max_cutoff,
+        flagged_words,
+        flagged_words_max_cutoff,
     ):
         cond = True
-        if badwords:
-            badwords_ratio = Filtering.compute_badwords_ratio(
+        if flagged_words:
+            flagged_words_ratio = Filtering.compute_flagged_words_ratio(
                 document,
                 sentencepiece_model_tok,
                 strip_characters,
                 cond_words_augmentation,
                 words_augmentation_group_sizes,
                 words_augmentation_join_char,
-                badwords,
+                flagged_words,
             )
-            cond = badwords_ratio <= badwords_max_cutoff
+            cond = flagged_words_ratio <= flagged_words_max_cutoff
         return cond
 
     @staticmethod
@@ -682,9 +682,9 @@ def filtering(
         cond_check_stopwords,
         stopwords,
         stopwords_min_cutoff,
-        cond_check_badwords,
-        badwords,
-        badwords_max_cutoff,
+        cond_check_flagged_words,
+        flagged_words,
+        flagged_words_max_cutoff,
         cond_check_lang_id,
         lang_dataset_id,
         model_lang_id,
@@ -729,16 +729,16 @@ def filtering(
                 stopwords_min_cutoff,
             ):
                 return False
-        if cond_check_badwords:
-            if not Filtering.check_badwords(
+        if cond_check_flagged_words:
+            if not Filtering.check_flagged_words(
                 document,
                 sentencepiece_model_tok,
                 strip_characters,
                 cond_words_augmentation,
                 words_augmentation_group_sizes,
                 words_augmentation_join_char,
-                badwords,
-                badwords_max_cutoff,
+                flagged_words,
+                flagged_words_max_cutoff,
             ):
                 return False
         if cond_check_lang_id:
@@ -775,7 +775,7 @@ def __init__(
 
         self.param = LoadParameters.load_parameters(lang_dataset_id)
         self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
-        self.badwords = LoadParameters.load_badwords(lang_dataset_id)
+        self.flagged_words = LoadParameters.load_flagged_words(lang_dataset_id)
         self.model_lang_id = LoadParameters.load_model_lang_id(
             lang_dataset_id, path_fasttext_model
         )
@@ -809,9 +809,9 @@ def __call__(self, example):
             cond_check_stopwords=self.param["cond_check_stopwords"],
             stopwords=self.stopwords,
             stopwords_min_cutoff=self.param["stopwords_min_cutoff"],
-            cond_check_badwords=self.param["cond_check_badwords"],
-            badwords=self.badwords,
-            badwords_max_cutoff=self.param["badwords_max_cutoff"],
+            cond_check_flagged_words=self.param["cond_check_flagged_words"],
+            flagged_words=self.flagged_words,
+            flagged_words_max_cutoff=self.param["flagged_words_max_cutoff"],
             cond_check_lang_id=self.param["cond_check_lang_id"],
             lang_dataset_id=self.lang_dataset_id,
             model_lang_id=self.model_lang_id,
 
@@ -6,7 +6,7 @@
 # https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
 
 
-english_badwords = [
+english_flagged_words = [
     "adult",
     "amateur",
     "anal",
@@ -229,8 +229,8 @@
 ]
 
 
-badwords = {
-    "ar": english_badwords
+flagged_words = {
+    "ar": english_flagged_words
     + [
         "احتلام",
         "اغتصاب",
@@ -271,7 +271,7 @@
         "مفلقسة",
         "نيك",
     ],
-    "ca": english_badwords
+    "ca": english_flagged_words
     + [
         "anal",
         "anus",
@@ -405,8 +405,8 @@
         "x classificat",
         "xxx",
     ],
-    "en": english_badwords,
-    "es": english_badwords
+    "en": english_flagged_words,
+    "es": english_flagged_words
     + [
         "Asesinato",
         "Bollera",
@@ -598,7 +598,7 @@
         "xxx",
         "zurullo",
     ],
-    "eu": english_badwords
+    "eu": english_flagged_words
     + [
         "abortu",
         "anal",
@@ -639,7 +639,7 @@
         "bagina",
         "viagra",
     ],
-    "fr": english_badwords
+    "fr": english_flagged_words
     + [
         "MALPT",
         "anal",
@@ -840,7 +840,7 @@
         "éjaculé",
         "étron",
     ],
-    "hi": english_badwords
+    "hi": english_flagged_words
     + [
         "aand",
         "aandu",
@@ -1098,7 +1098,7 @@
         "होमोसेक्सुअल",
         "होर",
     ],
-    "id": english_badwords
+    "id": english_flagged_words
     + [
         "abortus",
         "anal",
@@ -1216,7 +1216,7 @@
         "x diberi peringkat",
         "xxx",
     ],
-    "kn": english_badwords
+    "kn": english_flagged_words
     + [
         "ಗರ್ಭಪಾತ",
         "ಗುದ",
@@ -1359,7 +1359,7 @@
         "x ರೇಟೆಡ್",
         "xxx",
     ],
-    "ml": english_badwords
+    "ml": english_flagged_words
     + [
         "ഗർഭഛിദ്രം",
         "വിശപ്പ്",
@@ -1496,7 +1496,7 @@
         "x റേറ്റുചെയ്തു",
         "xxx",
     ],
-    "mr": english_badwords
+    "mr": english_flagged_words
     + [
         "गर्भपात",
         "गुदा",
@@ -1635,7 +1635,7 @@
         "एक्स रेट केले",
         "xxx",
     ],
-    "pt": english_badwords
+    "pt": english_flagged_words
     + [
         "aborto",
         "amador",
@@ -1833,7 +1833,7 @@
         "xxx",
         "ânus",
     ],
-    "te": english_badwords
+    "te": english_flagged_words
     + [
         "గర్భస్రావం",
         "అంగ",
@@ -1980,7 +1980,7 @@
         "x రేట్",
         "xxx",
     ],
-    "vi": english_badwords
+    "vi": english_flagged_words
     + [
         "sự phá thai",
         "hậu môn",
@@ -2098,7 +2098,7 @@
         "x đánh giá",
         "xxx",
     ],
-    "zh": english_badwords
+    "zh": english_flagged_words
     + [
         "13.",
         "13点",