Skip to content

Commit 84184c8

Browse files
committed
remove oscar dependency
1 parent fc7a967 commit 84184c8

7 files changed

Lines changed: 144 additions & 1411 deletions

File tree

ac_dc/download_kenlm_models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
Usage:
44
python download_kenlm_models.py --output_path /tmp/
55
6-
All 48 kenlm language models will be saved under /tmp.
6+
All kenlm language models will be saved under /tmp.
77
"""
88

99
import argparse
Lines changed: 54 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,17 @@
1919

2020
class LoadParameters:
2121
@staticmethod
22-
def load_parameters(lang_oscar_id):
23-
if lang_oscar_id in parameters_filtering:
24-
param = parameters_filtering[lang_oscar_id]
22+
def load_parameters(lang_dataset_id):
23+
if lang_dataset_id in parameters_filtering:
24+
param = parameters_filtering[lang_dataset_id]
2525
else:
2626
param = parameters_filtering["default"]
2727
return param
2828

2929
@staticmethod
30-
def load_stopwords(lang_oscar_id):
30+
def load_stopwords(lang_dataset_id):
3131
stopwords_lang_id = langs_id.loc[
32-
langs_id["oscar_id"] == lang_oscar_id, "stopwords_id"
32+
langs_id["dataset_id"] == lang_dataset_id, "stopwords_id"
3333
].iloc[0]
3434
if stopwords_lang_id:
3535
stopwords_lang = set(stopwords[stopwords_lang_id])
@@ -38,9 +38,9 @@ def load_stopwords(lang_oscar_id):
3838
return stopwords_lang
3939

4040
@staticmethod
41-
def load_badwords(lang_oscar_id):
41+
def load_badwords(lang_dataset_id):
4242
badwords_lang_id = langs_id.loc[
43-
langs_id["oscar_id"] == lang_oscar_id, "badwords_id"
43+
langs_id["dataset_id"] == lang_dataset_id, "badwords_id"
4444
].iloc[0]
4545
if badwords_lang_id:
4646
badwords_lang = set(badwords[badwords_lang_id])
@@ -49,9 +49,9 @@ def load_badwords(lang_oscar_id):
4949
return badwords_lang
5050

5151
@staticmethod
52-
def load_model_lang_id(lang_oscar_id, path_fasttext_model):
52+
def load_model_lang_id(lang_dataset_id, path_fasttext_model):
5353
fasttext_lang_id = langs_id.loc[
54-
langs_id["oscar_id"] == lang_oscar_id, "fasttext_id"
54+
langs_id["dataset_id"] == lang_dataset_id, "fasttext_id"
5555
].iloc[0]
5656
if fasttext_lang_id:
5757
model_lang_id = fasttext.load_model(path_fasttext_model)
@@ -60,9 +60,9 @@ def load_model_lang_id(lang_oscar_id, path_fasttext_model):
6060
return model_lang_id
6161

6262
@staticmethod
63-
def load_sentencepiece_model(lang_oscar_id, path_sentencepiece_model):
63+
def load_sentencepiece_model(lang_dataset_id, path_sentencepiece_model):
6464
sentencepiece_lang_id = langs_id.loc[
65-
langs_id["oscar_id"] == lang_oscar_id, "sentencepiece_id"
65+
langs_id["dataset_id"] == lang_dataset_id, "sentencepiece_id"
6666
].iloc[0]
6767
if sentencepiece_lang_id:
6868
sentencepiece_model = sentencepiece.SentencePieceProcessor()
@@ -72,9 +72,9 @@ def load_sentencepiece_model(lang_oscar_id, path_sentencepiece_model):
7272
return sentencepiece_model
7373

7474
@staticmethod
75-
def load_kenlm_model(lang_oscar_id, path_kenlm_model):
75+
def load_kenlm_model(lang_dataset_id, path_kenlm_model):
7676
kenlm_lang_id = langs_id.loc[
77-
langs_id["oscar_id"] == lang_oscar_id, "kenlm_id"
77+
langs_id["dataset_id"] == lang_dataset_id, "kenlm_id"
7878
].iloc[0]
7979
if kenlm_lang_id:
8080
kenlm_model = kenlm.Model(path_kenlm_model)
@@ -377,10 +377,10 @@ def modifying_documents(
377377
return document
378378

379379

380-
class OscarModifyingDocuments:
381-
def __init__(self, lang_oscar_id):
382-
self.lang_oscar_id = lang_oscar_id
383-
self.param = LoadParameters.load_parameters(lang_oscar_id)
380+
class FunctionDatasetModifyingDocuments:
381+
def __init__(self, lang_dataset_id):
382+
self.lang_dataset_id = lang_dataset_id
383+
self.param = LoadParameters.load_parameters(lang_dataset_id)
384384

385385
def __call__(self, example):
386386
example["text"] = ModifyingDocuments.modifying_documents(
@@ -400,7 +400,7 @@ def __call__(self, example):
400400
return example
401401

402402
def __reduce__(self):
403-
return (self.__class__, (self.lang_oscar_id,))
403+
return (self.__class__, (self.lang_dataset_id,))
404404

405405

406406
class Filtering:
@@ -569,28 +569,28 @@ def compute_lang_id_pred_score(document, model_lang_id):
569569
pred = model_lang_id.predict(document)
570570
lang_pred_fasttext_id = pred[0][0].replace("__label__", "")
571571
score_pred = pred[1][0]
572-
lang_pred_oscar_id = langs_id.loc[
573-
langs_id["fasttext_id"] == lang_pred_fasttext_id, "oscar_id"
572+
lang_pred_dataset_id = langs_id.loc[
573+
langs_id["fasttext_id"] == lang_pred_fasttext_id, "dataset_id"
574574
]
575-
if len(lang_pred_oscar_id) > 0:
576-
lang_pred_oscar_id = lang_pred_oscar_id.iloc[0]
575+
if len(lang_pred_dataset_id) > 0:
576+
lang_pred_dataset_id = lang_pred_dataset_id.iloc[0]
577577
else:
578-
lang_pred_oscar_id = "unknown"
579-
return lang_pred_oscar_id, score_pred
578+
lang_pred_dataset_id = "unknown"
579+
return lang_pred_dataset_id, score_pred
580580

581581
@staticmethod
582582
def check_lang_id(
583583
document,
584-
lang_oscar_id,
584+
lang_dataset_id,
585585
model_lang_id,
586586
lang_id_min_cutoff,
587587
):
588588
cond = True
589589
if model_lang_id:
590-
lang_pred_oscar_id, score_pred = Filtering.compute_lang_id_pred_score(
590+
lang_pred_dataset_id, score_pred = Filtering.compute_lang_id_pred_score(
591591
document, model_lang_id
592592
)
593-
cond = (lang_pred_oscar_id == lang_oscar_id) and (
593+
cond = (lang_pred_dataset_id == lang_dataset_id) and (
594594
score_pred >= lang_id_min_cutoff
595595
)
596596
return cond
@@ -655,7 +655,7 @@ def filtering(
655655
badwords,
656656
badwords_max_cutoff,
657657
cond_check_lang_id,
658-
lang_oscar_id,
658+
lang_dataset_id,
659659
model_lang_id,
660660
lang_id_min_cutoff,
661661
cond_check_perplexity,
@@ -706,7 +706,7 @@ def filtering(
706706
if cond_check_lang_id:
707707
if not Filtering.check_lang_id(
708708
document,
709-
lang_oscar_id,
709+
lang_dataset_id,
710710
model_lang_id,
711711
lang_id_min_cutoff,
712712
):
@@ -722,33 +722,33 @@ def filtering(
722722
return True
723723

724724

725-
class FuncOscarFiltering:
725+
class FunctionDatasetFiltering:
726726
def __init__(
727727
self,
728-
lang_oscar_id,
728+
lang_dataset_id,
729729
path_fasttext_model,
730730
path_sentencepiece_model,
731731
path_kenlm_model,
732732
):
733-
self.lang_oscar_id = lang_oscar_id
733+
self.lang_dataset_id = lang_dataset_id
734734
self.path_fasttext_model = path_fasttext_model
735735
self.path_sentencepiece_model = path_sentencepiece_model
736736
self.path_kenlm_model = path_kenlm_model
737737

738-
self.param = LoadParameters.load_parameters(lang_oscar_id)
739-
self.stopwords = LoadParameters.load_stopwords(lang_oscar_id)
740-
self.badwords = LoadParameters.load_badwords(lang_oscar_id)
738+
self.param = LoadParameters.load_parameters(lang_dataset_id)
739+
self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
740+
self.badwords = LoadParameters.load_badwords(lang_dataset_id)
741741
self.model_lang_id = LoadParameters.load_model_lang_id(
742-
lang_oscar_id, path_fasttext_model
742+
lang_dataset_id, path_fasttext_model
743743
)
744744
self.sentencepiece_model = LoadParameters.load_sentencepiece_model(
745-
lang_oscar_id, path_sentencepiece_model
745+
lang_dataset_id, path_sentencepiece_model
746746
)
747747
self.sentencepiece_model_tok = (
748748
self.sentencepiece_model if self.param["tokenization"] else None
749749
)
750750
self.kenlm_model = LoadParameters.load_kenlm_model(
751-
lang_oscar_id, path_kenlm_model
751+
lang_dataset_id, path_kenlm_model
752752
)
753753

754754
def __call__(self, example):
@@ -772,7 +772,7 @@ def __call__(self, example):
772772
badwords=self.badwords,
773773
badwords_max_cutoff=self.param["badwords_max_cutoff"],
774774
cond_check_lang_id=self.param["cond_check_lang_id"],
775-
lang_oscar_id=self.lang_oscar_id,
775+
lang_dataset_id=self.lang_dataset_id,
776776
model_lang_id=self.model_lang_id,
777777
lang_id_min_cutoff=self.param["lang_id_min_cutoff"],
778778
cond_check_perplexity=self.param["cond_check_perplexity"],
@@ -786,50 +786,52 @@ def __reduce__(self):
786786
return (
787787
self.__class__,
788788
(
789-
self.lang_oscar_id,
789+
self.lang_dataset_id,
790790
self.path_fasttext_model,
791791
self.path_sentencepiece_model,
792792
self.path_kenlm_model,
793793
),
794794
)
795795

796796

797-
class OscarFiltering:
797+
class DatasetFiltering:
798798
def __init__(
799799
self,
800800
dataset,
801-
lang_oscar_id,
801+
lang_dataset_id,
802802
path_fasttext_model,
803803
path_sentencepiece_model,
804804
path_kenlm_model,
805805
num_proc,
806-
path_dir_save_oscar,
806+
path_dir_save_dataset,
807807
):
808808
self.ds = dataset
809-
self.lang_oscar_id = lang_oscar_id
809+
self.lang_dataset_id = lang_dataset_id
810810
self.path_fasttext_model = path_fasttext_model
811811
self.path_sentencepiece_model = path_sentencepiece_model
812812
self.path_kenlm_model = path_kenlm_model
813813
self.num_proc = num_proc
814-
self.path_dir_save_oscar = path_dir_save_oscar
814+
self.path_dir_save_dataset = path_dir_save_dataset
815815

816816
def modifying_documents(self):
817-
oscar_modifying_documents = OscarModifyingDocuments(self.lang_oscar_id)
818-
self.ds = self.ds.map(oscar_modifying_documents, num_proc=self.num_proc)
817+
dataset_modifying_documents = FunctionDatasetModifyingDocuments(
818+
self.lang_dataset_id
819+
)
820+
self.ds = self.ds.map(dataset_modifying_documents, num_proc=self.num_proc)
819821

820822
def filtering(self):
821-
func_oscar_filtering = FuncOscarFiltering(
822-
self.lang_oscar_id,
823+
func_dataset_filtering = FunctionDatasetFiltering(
824+
self.lang_dataset_id,
823825
self.path_fasttext_model,
824826
self.path_sentencepiece_model,
825827
self.path_kenlm_model,
826828
)
827-
self.ds = self.ds.filter(func_oscar_filtering, num_proc=self.num_proc)
829+
self.ds = self.ds.filter(func_dataset_filtering, num_proc=self.num_proc)
828830

829831
def save_dataset(self):
830-
pathlib.Path(self.path_dir_save_oscar).mkdir(parents=True, exist_ok=True)
832+
pathlib.Path(self.path_dir_save_dataset).mkdir(parents=True, exist_ok=True)
831833
path_dir_save_dataset = pathlib.PurePath(
832-
self.path_dir_save_oscar, self.lang_oscar_id
834+
self.path_dir_save_dataset, self.lang_dataset_id
833835
)
834836
pathlib.Path(path_dir_save_dataset).mkdir(parents=True, exist_ok=True)
835837
self.ds.save_to_disk(path_dir_save_dataset)

0 commit comments

Comments
 (0)