1919
class LoadParameters:
    """Static helpers that fetch per-language resources used by the filtering pipeline."""

    @staticmethod
    def load_parameters(lang_dataset_id):
        """Return the filtering parameters for *lang_dataset_id*.

        Falls back to the "default" entry when the language has no
        dedicated parameter set in ``parameters_filtering``.
        """
        key = lang_dataset_id if lang_dataset_id in parameters_filtering else "default"
        return parameters_filtering[key]
2828
2929 @staticmethod
30- def load_stopwords (lang_oscar_id ):
30+ def load_stopwords (lang_dataset_id ):
3131 stopwords_lang_id = langs_id .loc [
32- langs_id ["oscar_id " ] == lang_oscar_id , "stopwords_id"
32+ langs_id ["dataset_id " ] == lang_dataset_id , "stopwords_id"
3333 ].iloc [0 ]
3434 if stopwords_lang_id :
3535 stopwords_lang = set (stopwords [stopwords_lang_id ])
@@ -38,9 +38,9 @@ def load_stopwords(lang_oscar_id):
3838 return stopwords_lang
3939
4040 @staticmethod
41- def load_badwords (lang_oscar_id ):
41+ def load_badwords (lang_dataset_id ):
4242 badwords_lang_id = langs_id .loc [
43- langs_id ["oscar_id " ] == lang_oscar_id , "badwords_id"
43+ langs_id ["dataset_id " ] == lang_dataset_id , "badwords_id"
4444 ].iloc [0 ]
4545 if badwords_lang_id :
4646 badwords_lang = set (badwords [badwords_lang_id ])
@@ -49,9 +49,9 @@ def load_badwords(lang_oscar_id):
4949 return badwords_lang
5050
5151 @staticmethod
52- def load_model_lang_id (lang_oscar_id , path_fasttext_model ):
52+ def load_model_lang_id (lang_dataset_id , path_fasttext_model ):
5353 fasttext_lang_id = langs_id .loc [
54- langs_id ["oscar_id " ] == lang_oscar_id , "fasttext_id"
54+ langs_id ["dataset_id " ] == lang_dataset_id , "fasttext_id"
5555 ].iloc [0 ]
5656 if fasttext_lang_id :
5757 model_lang_id = fasttext .load_model (path_fasttext_model )
@@ -60,9 +60,9 @@ def load_model_lang_id(lang_oscar_id, path_fasttext_model):
6060 return model_lang_id
6161
6262 @staticmethod
63- def load_sentencepiece_model (lang_oscar_id , path_sentencepiece_model ):
63+ def load_sentencepiece_model (lang_dataset_id , path_sentencepiece_model ):
6464 sentencepiece_lang_id = langs_id .loc [
65- langs_id ["oscar_id " ] == lang_oscar_id , "sentencepiece_id"
65+ langs_id ["dataset_id " ] == lang_dataset_id , "sentencepiece_id"
6666 ].iloc [0 ]
6767 if sentencepiece_lang_id :
6868 sentencepiece_model = sentencepiece .SentencePieceProcessor ()
@@ -72,9 +72,9 @@ def load_sentencepiece_model(lang_oscar_id, path_sentencepiece_model):
7272 return sentencepiece_model
7373
7474 @staticmethod
75- def load_kenlm_model (lang_oscar_id , path_kenlm_model ):
75+ def load_kenlm_model (lang_dataset_id , path_kenlm_model ):
7676 kenlm_lang_id = langs_id .loc [
77- langs_id ["oscar_id " ] == lang_oscar_id , "kenlm_id"
77+ langs_id ["dataset_id " ] == lang_dataset_id , "kenlm_id"
7878 ].iloc [0 ]
7979 if kenlm_lang_id :
8080 kenlm_model = kenlm .Model (path_kenlm_model )
@@ -377,10 +377,10 @@ def modifying_documents(
377377 return document
378378
379379
class FunctionDatasetModifyingDocuments:
    """Picklable callable that rewrites the ``text`` field of dataset examples."""

    def __init__(self, lang_dataset_id):
        """Remember the language id and load its modification parameters."""
        self.lang_dataset_id = lang_dataset_id
        # Per-language parameter dict driving the document modification step.
        self.param = LoadParameters.load_parameters(lang_dataset_id)
384384
385385 def __call__ (self , example ):
386386 example ["text" ] = ModifyingDocuments .modifying_documents (
@@ -400,7 +400,7 @@ def __call__(self, example):
400400 return example
401401
402402 def __reduce__ (self ):
403- return (self .__class__ , (self .lang_oscar_id ,))
403+ return (self .__class__ , (self .lang_dataset_id ,))
404404
405405
406406class Filtering :
@@ -569,28 +569,28 @@ def compute_lang_id_pred_score(document, model_lang_id):
569569 pred = model_lang_id .predict (document )
570570 lang_pred_fasttext_id = pred [0 ][0 ].replace ("__label__" , "" )
571571 score_pred = pred [1 ][0 ]
572- lang_pred_oscar_id = langs_id .loc [
573- langs_id ["fasttext_id" ] == lang_pred_fasttext_id , "oscar_id "
572+ lang_pred_dataset_id = langs_id .loc [
573+ langs_id ["fasttext_id" ] == lang_pred_fasttext_id , "dataset_id "
574574 ]
575- if len (lang_pred_oscar_id ) > 0 :
576- lang_pred_oscar_id = lang_pred_oscar_id .iloc [0 ]
575+ if len (lang_pred_dataset_id ) > 0 :
576+ lang_pred_dataset_id = lang_pred_dataset_id .iloc [0 ]
577577 else :
578- lang_pred_oscar_id = "unknown"
579- return lang_pred_oscar_id , score_pred
578+ lang_pred_dataset_id = "unknown"
579+ return lang_pred_dataset_id , score_pred
580580
581581 @staticmethod
582582 def check_lang_id (
583583 document ,
584- lang_oscar_id ,
584+ lang_dataset_id ,
585585 model_lang_id ,
586586 lang_id_min_cutoff ,
587587 ):
588588 cond = True
589589 if model_lang_id :
590- lang_pred_oscar_id , score_pred = Filtering .compute_lang_id_pred_score (
590+ lang_pred_dataset_id , score_pred = Filtering .compute_lang_id_pred_score (
591591 document , model_lang_id
592592 )
593- cond = (lang_pred_oscar_id == lang_oscar_id ) and (
593+ cond = (lang_pred_dataset_id == lang_dataset_id ) and (
594594 score_pred >= lang_id_min_cutoff
595595 )
596596 return cond
@@ -655,7 +655,7 @@ def filtering(
655655 badwords ,
656656 badwords_max_cutoff ,
657657 cond_check_lang_id ,
658- lang_oscar_id ,
658+ lang_dataset_id ,
659659 model_lang_id ,
660660 lang_id_min_cutoff ,
661661 cond_check_perplexity ,
@@ -706,7 +706,7 @@ def filtering(
706706 if cond_check_lang_id :
707707 if not Filtering .check_lang_id (
708708 document ,
709- lang_oscar_id ,
709+ lang_dataset_id ,
710710 model_lang_id ,
711711 lang_id_min_cutoff ,
712712 ):
@@ -722,33 +722,33 @@ def filtering(
722722 return True
723723
724724
class FunctionDatasetFiltering:
    """Picklable predicate deciding whether a dataset example is kept.

    Loads every per-language resource (parameters, stopword/badword lists,
    fastText language-id model, SentencePiece tokenizer, KenLM model) once
    at construction so the callable can be mapped over a dataset.
    """

    def __init__(
        self,
        lang_dataset_id,
        path_fasttext_model,
        path_sentencepiece_model,
        path_kenlm_model,
    ):
        self.lang_dataset_id = lang_dataset_id
        self.path_fasttext_model = path_fasttext_model
        self.path_sentencepiece_model = path_sentencepiece_model
        self.path_kenlm_model = path_kenlm_model

        self.param = LoadParameters.load_parameters(lang_dataset_id)
        self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
        self.badwords = LoadParameters.load_badwords(lang_dataset_id)
        self.model_lang_id = LoadParameters.load_model_lang_id(
            lang_dataset_id, path_fasttext_model
        )
        self.sentencepiece_model = LoadParameters.load_sentencepiece_model(
            lang_dataset_id, path_sentencepiece_model
        )
        # The tokenizer is only used when the language's parameters ask for it.
        if self.param["tokenization"]:
            self.sentencepiece_model_tok = self.sentencepiece_model
        else:
            self.sentencepiece_model_tok = None
        self.kenlm_model = LoadParameters.load_kenlm_model(
            lang_dataset_id, path_kenlm_model
        )
754754 def __call__ (self , example ):
@@ -772,7 +772,7 @@ def __call__(self, example):
772772 badwords = self .badwords ,
773773 badwords_max_cutoff = self .param ["badwords_max_cutoff" ],
774774 cond_check_lang_id = self .param ["cond_check_lang_id" ],
775- lang_oscar_id = self .lang_oscar_id ,
775+ lang_dataset_id = self .lang_dataset_id ,
776776 model_lang_id = self .model_lang_id ,
777777 lang_id_min_cutoff = self .param ["lang_id_min_cutoff" ],
778778 cond_check_perplexity = self .param ["cond_check_perplexity" ],
@@ -786,50 +786,52 @@ def __reduce__(self):
786786 return (
787787 self .__class__ ,
788788 (
789- self .lang_oscar_id ,
789+ self .lang_dataset_id ,
790790 self .path_fasttext_model ,
791791 self .path_sentencepiece_model ,
792792 self .path_kenlm_model ,
793793 ),
794794 )
795795
796796
class DatasetFiltering:
    """Drive the modify → filter → save pipeline over one dataset.

    The *dataset* argument is expected to expose ``map``, ``filter`` and
    ``save_to_disk`` (Hugging Face ``datasets`` API — TODO confirm against
    callers).
    """

    def __init__(
        self,
        dataset,
        lang_dataset_id,
        path_fasttext_model,
        path_sentencepiece_model,
        path_kenlm_model,
        num_proc,
        path_dir_save_dataset,
    ):
        self.ds = dataset
        self.lang_dataset_id = lang_dataset_id
        self.path_fasttext_model = path_fasttext_model
        self.path_sentencepiece_model = path_sentencepiece_model
        self.path_kenlm_model = path_kenlm_model
        self.num_proc = num_proc
        self.path_dir_save_dataset = path_dir_save_dataset

    def modifying_documents(self):
        """Apply per-document text modification across the whole dataset."""
        mapper = FunctionDatasetModifyingDocuments(self.lang_dataset_id)
        self.ds = self.ds.map(mapper, num_proc=self.num_proc)

    def filtering(self):
        """Drop documents rejected by the language-specific filters."""
        predicate = FunctionDatasetFiltering(
            self.lang_dataset_id,
            self.path_fasttext_model,
            self.path_sentencepiece_model,
            self.path_kenlm_model,
        )
        self.ds = self.ds.filter(predicate, num_proc=self.num_proc)

    def save_dataset(self):
        """Save the dataset under ``<path_dir_save_dataset>/<lang_dataset_id>``."""
        pathlib.Path(self.path_dir_save_dataset).mkdir(parents=True, exist_ok=True)
        lang_dir = pathlib.PurePath(self.path_dir_save_dataset, self.lang_dataset_id)
        pathlib.Path(lang_dir).mkdir(parents=True, exist_ok=True)
        self.ds.save_to_disk(lang_dir)
0 commit comments