@@ -171,9 +171,7 @@ def print_discared_by_cond(cond):
                     if "10" in val_repetitions_lengths
                     else 0
                 )
-                label_selectbox = (
-                    "Length of the repetitions (that will determine the repetitions ratio)."
-                )
+                label_selectbox = "Length of the repetitions (that will determine the repetitions ratio)."
                 repetitions_length = st.sidebar.selectbox(
                     label=label_selectbox,
                     options=val_repetitions_lengths,
@@ -270,6 +268,7 @@ def print_discared_by_cond(cond):
             return keys, conds

         self.keys, conds = set_sliders()
+        self.parameters = self.keys * 1

         all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
         all_conds = np.all(all_conds, axis=0)
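A side note on the added line above: multiplying a list by 1 is a terse way to take a shallow copy, so the `append` calls made later in `filtering_of_words` extend `self.parameters` without mutating `self.keys`. A minimal sketch of the idiom (the tuple values here are invented for illustration):

# Shallow-copy idiom used by the added line; example values are hypothetical.
keys = [("number_words", 100, False), ("perplexity_score", 1500, True)]
parameters = keys * 1                # new list object, same element references
parameters.append(("len_word", 25, True))
assert len(keys) == 2                # the original list is unchanged
assert parameters[0] is keys[0]      # elements are shared (shallow copy)

`list(keys)` or `keys.copy()` would express the same intent more conventionally.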
@@ -356,10 +355,14 @@ def filtering_of_words(self):
         cutoff_def = "If the length of a word is higher than this number, the word is removed."
         max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
         cutoff_word = st.sidebar.slider(cutoff_def, 0, max_len_word, max_len_word)
+        self.parameters.append(("len_word", cutoff_word, True))
+        st.sidebar.caption("---------")

         incorrect_substrings = st.sidebar.checkbox(
             "Remove words with incorrect substrings."
         )
+        self.parameters.append(("incorrect_substrings", incorrect_substrings))
+        st.sidebar.caption("---------")

         cond_words = self.words["len_word"] <= cutoff_word
         if incorrect_substrings:
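The two `append` calls above give `self.parameters` entries of different shapes: the document-level keys copied from `self.keys` are at least 3-tuples of (name, cutoff, is_max_cutoff), while the checkbox entry is a 2-tuple flag. A hypothetical snapshot of the list after this method runs (values invented for illustration):

# Hypothetical contents of self.parameters after filtering_of_words:
parameters = [
    ("number_words", 100, False),     # document-level, copied from self.keys
    ("len_word", 25, True),           # appended by the word-length slider
    ("incorrect_substrings", False),  # appended by the checkbox; a 2-tuple
]

Anything that consumes the serialized file should be prepared for both tuple lengths.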
@@ -390,6 +393,13 @@ def filtering_of_words(self):
         )
         st.dataframe(retained_words)

+    def download_parameters(self):
+        btn = st.sidebar.download_button(
+            label="Download current parameters as json",
+            data=json.dumps(self.parameters),
+            file_name=f"parameters_{self.lang_dataset_id}.json",
+        )
+
     def plot_distributions_filtering_parameters(self):
         st.header("Distributions of the filtering parameters")

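The new `download_parameters` method serializes that list with `json.dumps`, which renders the tuples as JSON arrays, and hands it to Streamlit's `st.sidebar.download_button`. This assumes `json` is imported at the top of the module (not visible in this hunk); the boolean bound to `btn` (whether the button was clicked on this rerun) goes unused. A self-contained sketch of the same pattern, with an invented dataset id:

# Standalone sketch of the download hook; the parameter values and the
# dataset id are hypothetical.
import json
import streamlit as st

parameters = [("len_word", 25, True), ("incorrect_substrings", False)]
st.sidebar.download_button(
    label="Download current parameters as json",
    data=json.dumps(parameters),           # tuples become JSON arrays
    file_name="parameters_en_oscar.json",  # stands in for self.lang_dataset_id
)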
@@ -446,94 +456,109 @@ def analyse_personal_doc(self):
         is_discarded = False

         def is_doc_discarded(key, score):
-            if key[2]: # max cutoff
+            if key[2]:  # max cutoff
                 return score > key[1]
             else:
                 return score < key[1]

-        st.markdown("Statistics of the document:")
-
-        for key in self.keys:
-            if key[0] == "number_words":
-                words = ModifyingDocuments.get_words_from_document(
-                    personal_doc,
-                    self.sentencepiece_model_tok,
-                    lower_case=False,
-                    strip_characters=self.param["strip_characters"],
-                )
-                if key[2]:
-                    st.markdown(f"Number of words: {len(words)}")
-                if is_doc_discarded(key, len(words)):
-                    is_discarded = True
-
-            elif key[0] == "repetitions_ratio":
-                repetitions_ratio = Filtering.compute_repetitions_ratio(personal_doc, int(key[3]))
-                repetitions_ratio = round(repetitions_ratio, 3)
-                st.markdown(f"Repetitions ratio: {repetitions_ratio}")
-                if is_doc_discarded(key, repetitions_ratio):
-                    is_discarded = True
-
-            elif key[0] == "special_characters_ratio":
-                special_characters_ratio = Filtering.compute_special_characters_ratio(
-                    personal_doc, self.param["special_characters"]
-                )
-                special_characters_ratio = round(special_characters_ratio, 3)
-                st.markdown(f"Special characters ratio: {special_characters_ratio}")
-                if is_doc_discarded(key, special_characters_ratio):
-                    is_discarded = True
-
-            elif key[0] == "stopwords_ratio":
-                stopwords_ratio = Filtering.compute_stopwords_ratio(
-                    personal_doc,
-                    self.sentencepiece_model_tok,
-                    self.param["strip_characters"],
-                    self.param["cond_words_augmentation"],
-                    self.param["words_augmentation_group_sizes"],
-                    self.param["words_augmentation_join_char"],
-                    self.stopwords,
-                )
-                stopwords_ratio = round(stopwords_ratio, 3)
-                st.markdown(f"Stop words ratio: {stopwords_ratio}")
-                if is_doc_discarded(key, stopwords_ratio):
-                    is_discarded = True
-
-            elif key[0] == "badwords_ratio":
-                badwords_ratio = Filtering.compute_badwords_ratio(
-                    personal_doc,
-                    self.sentencepiece_model_tok,
-                    self.param["strip_characters"],
-                    self.param["cond_words_augmentation"],
-                    self.param["words_augmentation_group_sizes"],
-                    self.param["words_augmentation_join_char"],
-                    self.badwords,
-                )
-                badwords_ratio = round(badwords_ratio, 3)
-                st.markdown(f"Flagged words ratio: {badwords_ratio}")
-                if is_doc_discarded(key, badwords_ratio):
-                    is_discarded = True
-
-            elif key[0] == "lang_id_score":
-                lang_pred_dataset_id, lang_id_score = Filtering.compute_lang_id_pred_score(
-                    personal_doc, self.model_lang_id
-                )
-                lang_id_score = round(lang_id_score, 3)
-                st.markdown(f"Language identification confidence score: {lang_id_score}")
-                if is_doc_discarded(key, badwords_ratio) or (self.lang_dataset_id != lang_pred_dataset_id):
-                    is_discarded = True
-
-            elif key[0] == "perplexity_score":
-                perplexity_score = Filtering.compute_perplexity_score(
-                    personal_doc,
-                    self.sentencepiece_model,
-                    self.kenlm_model,
-                )
-                perplexity_score = round(perplexity_score, 3)
-                st.markdown(f"Perplexity score: {perplexity_score}")
-                if is_doc_discarded(key, perplexity_score):
-                    is_discarded = True
-
-        is_discarded = "" if is_discarded else "not "
-        st.markdown(f"With the current filtering parameters, this document **is {is_discarded}discarded**.")
+        if personal_doc:
+
+            st.markdown("Statistics of the document:")
+
+            for key in self.keys:
+                if key[0] == "number_words":
+                    words = ModifyingDocuments.get_words_from_document(
+                        personal_doc,
+                        self.sentencepiece_model_tok,
+                        lower_case=False,
+                        strip_characters=self.param["strip_characters"],
+                    )
+                    if key[2]:
+                        st.markdown(f"Number of words: {len(words)}")
+                    if is_doc_discarded(key, len(words)):
+                        is_discarded = True
+
+                elif key[0] == "repetitions_ratio":
+                    repetitions_ratio = Filtering.compute_repetitions_ratio(
+                        personal_doc, int(key[3])
+                    )
+                    repetitions_ratio = round(repetitions_ratio, 3)
+                    st.markdown(f"Repetitions ratio: {repetitions_ratio}")
+                    if is_doc_discarded(key, repetitions_ratio):
+                        is_discarded = True
+
+                elif key[0] == "special_characters_ratio":
+                    special_characters_ratio = (
+                        Filtering.compute_special_characters_ratio(
+                            personal_doc, self.param["special_characters"]
+                        )
+                    )
+                    special_characters_ratio = round(special_characters_ratio, 3)
+                    st.markdown(f"Special characters ratio: {special_characters_ratio}")
+                    if is_doc_discarded(key, special_characters_ratio):
+                        is_discarded = True
+
+                elif key[0] == "stopwords_ratio":
+                    stopwords_ratio = Filtering.compute_stopwords_ratio(
+                        personal_doc,
+                        self.sentencepiece_model_tok,
+                        self.param["strip_characters"],
+                        self.param["cond_words_augmentation"],
+                        self.param["words_augmentation_group_sizes"],
+                        self.param["words_augmentation_join_char"],
+                        self.stopwords,
+                    )
+                    stopwords_ratio = round(stopwords_ratio, 3)
+                    st.markdown(f"Stop words ratio: {stopwords_ratio}")
+                    if is_doc_discarded(key, stopwords_ratio):
+                        is_discarded = True
+
+                elif key[0] == "badwords_ratio":
+                    badwords_ratio = Filtering.compute_badwords_ratio(
+                        personal_doc,
+                        self.sentencepiece_model_tok,
+                        self.param["strip_characters"],
+                        self.param["cond_words_augmentation"],
+                        self.param["words_augmentation_group_sizes"],
+                        self.param["words_augmentation_join_char"],
+                        self.badwords,
+                    )
+                    badwords_ratio = round(badwords_ratio, 3)
+                    st.markdown(f"Flagged words ratio: {badwords_ratio}")
+                    if is_doc_discarded(key, badwords_ratio):
+                        is_discarded = True
+
+                elif key[0] == "lang_id_score":
+                    (
+                        lang_pred_dataset_id,
+                        lang_id_score,
+                    ) = Filtering.compute_lang_id_pred_score(
+                        personal_doc, self.model_lang_id
+                    )
+                    lang_id_score = round(lang_id_score, 3)
+                    st.markdown(
+                        f"Language identification confidence score: {lang_id_score}"
+                    )
+                    if is_doc_discarded(key, lang_id_score) or (
+                        self.lang_dataset_id != lang_pred_dataset_id
+                    ):
+                        is_discarded = True
+
+                elif key[0] == "perplexity_score":
+                    perplexity_score = Filtering.compute_perplexity_score(
+                        personal_doc,
+                        self.sentencepiece_model,
+                        self.kenlm_model,
+                    )
+                    perplexity_score = round(perplexity_score, 3)
+                    st.markdown(f"Perplexity score: {perplexity_score}")
+                    if is_doc_discarded(key, perplexity_score):
+                        is_discarded = True
+
+            is_discarded = "" if is_discarded else "not "
+            st.markdown(
+                f"With the current filtering parameters, this document **is {is_discarded}discarded**."
+            )

     def download_data(self):
         st.header("Download data")
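For readers of this hunk: each entry of `self.keys` follows the convention (metric_name, cutoff, is_max_cutoff[, extra]), where `key[2]` selects the comparison direction and the repetitions key carries the chosen repetition length as `key[3]`. A small sketch of the cutoff logic with invented key tuples:

# Cutoff convention behind is_doc_discarded; the key tuples are made up.
def is_doc_discarded(key, score):
    if key[2]:  # max cutoff: discard when the score exceeds it
        return score > key[1]
    return score < key[1]  # min cutoff: discard when the score falls below it

assert is_doc_discarded(("perplexity_score", 1500, True), 2000)
assert not is_doc_discarded(("stopwords_ratio", 0.3, False), 0.4)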
@@ -546,12 +571,13 @@ def download_data(self):
         )

     def visualization(self):
-        #self.warning_preamble()
+        # self.warning_preamble()
         self.preamble()
         self.open_data()
         self.set_title()
         self.filtering_of_docs()
         self.filtering_of_words()
+        self.download_parameters()
         self.plot_distributions_filtering_parameters()
         self.plot_zipf_law()
         self.analyse_personal_doc()
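The placement of the new `self.download_parameters()` call matters: `filtering_of_docs` seeds `self.parameters` from the document-level sliders and `filtering_of_words` appends the word-level entries, so the download button can only be built after both have run. A condensed, hypothetical sketch of that ordering dependency (method bodies are stubs, values invented):

# Condensed sketch of the call-order dependency; not the real class body.
class Visualization:
    def filtering_of_docs(self):
        self.keys = [("number_words", 100, False)]  # from the sliders
        self.parameters = self.keys * 1             # seed the copy

    def filtering_of_words(self):
        self.parameters.append(("len_word", 25, True))

    def download_parameters(self):
        assert self.parameters  # raises AttributeError if called first

    def visualization(self):
        self.filtering_of_docs()
        self.filtering_of_words()
        self.download_parameters()  # must come after both filtering steps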