import base64
import json
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Silence pandas' SettingWithCopyWarning: the visualization deliberately
# mutates slices of the filtered DataFrames in place.
pd.options.mode.chained_assignment = None

# Fix: the original called os.path.join without importing os (NameError on
# import). Make the parent ac_dc directory importable so that filtering.py,
# which lives there, can be found regardless of the working directory.
sys.path.insert(1, os.path.join(sys.path[0], ".."))
sys.path.append(str(Path(sys.path[0]).parent.absolute().parent.absolute()))

from filtering import LoadParameters, ModifyingDocuments, Filtering
1728class Visualization :
1829 def __init__ (
@@ -23,6 +34,10 @@ def __init__(
2334 num_docs ,
2435 num_docs_for_words ,
2536 max_len_text_display ,
37+ lang_dataset_id ,
38+ path_fasttext_model ,
39+ path_sentencepiece_model ,
40+ path_kenlm_model ,
2641 ):
2742 self .path_instructions = path_instructions
2843 self .path_data = path_data
@@ -31,6 +46,32 @@ def __init__(
3146 self .num_docs_for_words = num_docs_for_words
3247 self .max_len_text_display = max_len_text_display
3348
49+ self .lang_dataset_id = lang_dataset_id
50+ self .param = LoadParameters .load_parameters (lang_dataset_id )
51+ self .stopwords = LoadParameters .load_stopwords (lang_dataset_id )
52+ self .badwords = LoadParameters .load_badwords (lang_dataset_id )
53+ self .model_lang_id = LoadParameters .load_model_lang_id (
54+ lang_dataset_id , path_fasttext_model
55+ )
56+ self .sentencepiece_model = LoadParameters .load_sentencepiece_model (
57+ lang_dataset_id , path_sentencepiece_model
58+ )
59+ self .sentencepiece_model_tok = (
60+ self .sentencepiece_model if self .param ["tokenization" ] else None
61+ )
62+ self .kenlm_model = LoadParameters .load_kenlm_model (
63+ lang_dataset_id , path_kenlm_model
64+ )
65+
66+ def warning_preamble (self ):
67+ st .markdown (
68+ "This demo can be a little slow, and only allows you to process up to 5000 documents "
69+ "for a decent speed. If you want to display up to three times more documents and have "
70+ "a faster visualization, we invite you to run this "
71+ "[code](https://github.com/bigscience-workshop/data_tooling/tree/master/ac_dc/visualization) "
72+ "on your computer."
73+ )
74+
3475 def preamble (self ):
3576 st .markdown (
3677 "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail."
@@ -131,21 +172,25 @@ def print_discared_by_cond(cond):
131172 else 0
132173 )
133174 label_selectbox = (
134- "Length of the repetitions (that will determine the repetitions ratio). "
135- "Choosing a higher or lower number does not mean that the filtering "
136- "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
137- "tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
138- "few or no repetitions, simply because their length gives them more diversity, and we do "
139- "not want to discard such documents."
175+ "Length of the repetitions (that will determine the repetitions ratio)."
140176 )
141177 repetitions_length = st .sidebar .selectbox (
142178 label = label_selectbox ,
143179 options = val_repetitions_lengths ,
144180 index = default_index ,
145181 )
182+ st .sidebar .caption (
183+ "Choosing a higher or lower number does not mean that the filtering "
184+ "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
185+ "tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
186+ "few or no repetitions, simply because their length gives them more diversity, and we do "
187+ "not want to discard such documents."
188+ )
146189 self .docs = self .docs_checkpoint
147190 for i in range (len (self .docs ["repetitions_ratio" ])):
148- self .docs ["repetitions_ratio" ].iloc [i ] = self .docs ["repetitions_ratio" ].iloc [i ][repetitions_length ]
191+ self .docs ["repetitions_ratio" ].iloc [i ] = self .docs [
192+ "repetitions_ratio"
193+ ].iloc [i ][repetitions_length ]
149194
150195 cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
151196 cutoff_repetitions_ratio = st .sidebar .slider (
@@ -155,6 +200,7 @@ def print_discared_by_cond(cond):
155200 "repetitions_ratio" ,
156201 cutoff_repetitions_ratio ,
157202 True ,
203+ repetitions_length ,
158204 )
159205 keys .append (new_key )
160206 cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
@@ -388,6 +434,107 @@ def plot_zipf_law(self):
388434 ax .set_ylabel ("frequency in the documents" )
389435 st .pyplot (fig )
390436
437+ def analyse_personal_doc (self ):
438+ st .header ("Analyse your own document" )
439+
440+ personal_doc = st .text_area (
441+ label = "Paste here the document you want to analyse" ,
442+ value = "" ,
443+ max_chars = 10000 ,
444+ )
445+
446+ is_discarded = False
447+
448+ def is_doc_discarded (key , score ):
449+ if key [2 ]: # max cutoff
450+ return score > key [1 ]
451+ else :
452+ return score < key [1 ]
453+
454+ st .markdown ("Statistics of the document:" )
455+
456+ for key in self .keys :
457+ if key [0 ] == "number_words" :
458+ words = ModifyingDocuments .get_words_from_document (
459+ personal_doc ,
460+ self .sentencepiece_model_tok ,
461+ lower_case = False ,
462+ strip_characters = self .param ["strip_characters" ],
463+ )
464+ if key [2 ]:
465+ st .markdown (f"Number of words: { len (words )} " )
466+ if is_doc_discarded (key , len (words )):
467+ is_discarded = True
468+
469+ elif key [0 ] == "repetitions_ratio" :
470+ repetitions_ratio = Filtering .compute_repetitions_ratio (personal_doc , int (key [3 ]))
471+ repetitions_ratio = round (repetitions_ratio , 3 )
472+ st .markdown (f"Repetitions ratio: { repetitions_ratio } " )
473+ if is_doc_discarded (key , repetitions_ratio ):
474+ is_discarded = True
475+
476+ elif key [0 ] == "special_characters_ratio" :
477+ special_characters_ratio = Filtering .compute_special_characters_ratio (
478+ personal_doc , self .param ["special_characters" ]
479+ )
480+ special_characters_ratio = round (special_characters_ratio , 3 )
481+ st .markdown (f"Special characters ratio: { special_characters_ratio } " )
482+ if is_doc_discarded (key , special_characters_ratio ):
483+ is_discarded = True
484+
485+ elif key [0 ] == "stopwords_ratio" :
486+ stopwords_ratio = Filtering .compute_stopwords_ratio (
487+ personal_doc ,
488+ self .sentencepiece_model_tok ,
489+ self .param ["strip_characters" ],
490+ self .param ["cond_words_augmentation" ],
491+ self .param ["words_augmentation_group_sizes" ],
492+ self .param ["words_augmentation_join_char" ],
493+ self .stopwords ,
494+ )
495+ stopwords_ratio = round (stopwords_ratio , 3 )
496+ st .markdown (f"Stop words ratio: { stopwords_ratio } " )
497+ if is_doc_discarded (key , stopwords_ratio ):
498+ is_discarded = True
499+
500+ elif key [0 ] == "badwords_ratio" :
501+ badwords_ratio = Filtering .compute_badwords_ratio (
502+ personal_doc ,
503+ self .sentencepiece_model_tok ,
504+ self .param ["strip_characters" ],
505+ self .param ["cond_words_augmentation" ],
506+ self .param ["words_augmentation_group_sizes" ],
507+ self .param ["words_augmentation_join_char" ],
508+ self .badwords ,
509+ )
510+ badwords_ratio = round (badwords_ratio , 3 )
511+ st .markdown (f"Flagged words ratio: { badwords_ratio } " )
512+ if is_doc_discarded (key , badwords_ratio ):
513+ is_discarded = True
514+
515+ elif key [0 ] == "lang_id_score" :
516+ lang_pred_dataset_id , lang_id_score = Filtering .compute_lang_id_pred_score (
517+ personal_doc , self .model_lang_id
518+ )
519+ lang_id_score = round (lang_id_score , 3 )
520+ st .markdown (f"Language identification confidence score: { lang_id_score } " )
521+ if is_doc_discarded (key , badwords_ratio ) or (self .lang_dataset_id != lang_pred_dataset_id ):
522+ is_discarded = True
523+
524+ elif key [0 ] == "perplexity_score" :
525+ perplexity_score = Filtering .compute_perplexity_score (
526+ personal_doc ,
527+ self .sentencepiece_model ,
528+ self .kenlm_model ,
529+ )
530+ perplexity_score = round (perplexity_score , 3 )
531+ st .markdown (f"Perplexity score: { perplexity_score } " )
532+ if is_doc_discarded (key , perplexity_score ):
533+ is_discarded = True
534+
535+ is_discarded = "" if is_discarded else "not "
536+ st .markdown (f"With the current filtering parameters, this document **is { is_discarded } discarded**." )
537+
391538 def download_data (self ):
392539 st .header ("Download data" )
393540
@@ -399,13 +546,15 @@ def download_data(self):
399546 )
400547
401548 def visualization (self ):
549+ #self.warning_preamble()
402550 self .preamble ()
403551 self .open_data ()
404552 self .set_title ()
405553 self .filtering_of_docs ()
406554 self .filtering_of_words ()
407555 self .plot_distributions_filtering_parameters ()
408556 self .plot_zipf_law ()
557+ self .analyse_personal_doc ()
409558 self .download_data ()
410559
411560
@@ -416,12 +565,22 @@ def visualization(self):
num_docs_for_words = 1500
max_len_text_display = 10000

# The four settings below are only used by analyse_personal_doc.
lang_dataset_id = "en"
path_fasttext_model = "./ac_dc/lid.176.bin"
path_sentencepiece_model = "./ac_dc/en.sp.model"
path_kenlm_model = "./ac_dc/en.arpa.bin"

# Build the app object and render the whole page.
visualization = Visualization(
    path_instructions,
    path_data,
    lang,
    num_docs,
    num_docs_for_words,
    max_len_text_display,
    lang_dataset_id,
    path_fasttext_model,
    path_sentencepiece_model,
    path_kenlm_model,
)
visualization.visualization()