@@ -123,16 +123,16 @@ def set_title(self):
123123
@staticmethod
def plot_hist(dataframe, key, num_bins=50):
    """Draw an optional, outlier-trimmed histogram of one statistic.

    A checkbox (ticked by default) is rendered first; the histogram is
    only built and displayed while that checkbox is ticked. Both widgets
    are emitted through the plain ``st`` API, so they land in whichever
    Streamlit container is active at the call site.

    Args:
        dataframe: Table supporting ``dataframe[column].values``.
            Assumed to yield a numeric NumPy array -- TODO confirm
            against callers.
        key: Sequence whose first item is the column name to plot and
            whose second item is the cutoff, drawn as a dashed red
            vertical line. Any further items are ignored here.
        num_bins: Number of bins passed to the histogram.
    """
    column = key[0]
    show = st.checkbox(
        "Display distribution",
        value=True,
        key=f"display_distribution_{column}",
    )
    if not show:
        return

    values = dataframe[column].values
    median = np.median(values)
    if median != 0:
        # Trim far outliers: keep points within 9 median absolute
        # deviations (MAD) of the median.
        mad = np.median(np.absolute(values - median))
        # A zero MAD (more than half of the values identical) would make
        # the strict comparison reject every point and leave an empty,
        # un-normalisable histogram, so the trim is skipped in that case.
        if mad > 0:
            values = values[np.absolute(values - median) < 9 * mad]

    fig, ax = plt.subplots()
    ax.hist(values, bins=num_bins, density=True)
    ax.set_title(" ".join(column.split("_")))
    ax.axvline(x=key[1], color="r", linestyle="dashed")
    st.pyplot(fig)
    # pyplot keeps every figure it creates registered until it is closed;
    # without this, each rerun of the app would accumulate one more figure.
    plt.close(fig)
136136
137137 def filtering_of_docs (self ):
138138 st .sidebar .subheader ("Parameters of the filtering on documents" )
@@ -148,142 +148,148 @@ def get_cond(key, cutoff, max_cutoff):
148148 return self .docs [key ] >= cutoff
149149
150150 def print_discared_by_cond (cond ):
151- st .sidebar . caption (
151+ st .caption (
152152 f"{ (len (cond ) - np .sum (1 * cond )) / len (cond ) * 100 :.2f} % of the total is discarded with this filter."
153153 )
154- st .sidebar .caption ("---------" )
155154
156155 if "number_words" in columns :
157- cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
158- max_nb_words = int (np .max (self .docs ["number_words" ])) + 1
159- cutoff_min_number_words = st .sidebar .slider (
160- cutoff_def , 0 , min (max_nb_words , 500 ), 0
161- )
162- new_key = ("number_words" , cutoff_min_number_words , False )
163- keys .append (new_key )
164- Visualization .plot_hist (self .docs , new_key )
165- cond_1 = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
166- print_discared_by_cond (cond_1 )
167-
168- cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
169- cutoff_max_number_words = st .sidebar .slider (
170- cutoff_def , 0 , max_nb_words , max_nb_words
171- )
172- new_key = ("number_words" , cutoff_max_number_words , True )
173- keys .append (new_key )
174- cond_2 = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
175- print_discared_by_cond (cond_2 )
156+ with st .sidebar .expander ("Number of words" ):
157+ cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
158+ max_nb_words = int (np .max (self .docs ["number_words" ])) + 1
159+ cutoff_min_number_words = st .slider (
160+ cutoff_def , 0 , min (max_nb_words , 500 ), 0
161+ )
162+ new_key = ("number_words" , cutoff_min_number_words , False )
163+ keys .append (new_key )
164+ Visualization .plot_hist (self .docs , new_key )
165+ cond_1 = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
166+ print_discared_by_cond (cond_1 )
167+
168+ cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
169+ cutoff_max_number_words = st .slider (
170+ cutoff_def , 0 , max_nb_words , max_nb_words
171+ )
172+ new_key = ("number_words" , cutoff_max_number_words , True )
173+ keys .append (new_key )
174+ cond_2 = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
175+ print_discared_by_cond (cond_2 )
176176
177- conds ["number_words" ] = [cond_1 , cond_2 ]
177+ conds ["number_words" ] = [cond_1 , cond_2 ]
178178
179179 if "repetitions_ratio" in columns :
180- val_repetitions_lengths = list (
181- self .docs ["repetitions_ratio" ].iloc [0 ].keys ()
182- )
183- default_index = (
184- val_repetitions_lengths .index ("10" )
185- if "10" in val_repetitions_lengths
186- else 0
187- )
188- label_selectbox = "Length of the repetitions (that will determine the repetitions ratio)."
189- repetitions_length = st .sidebar .selectbox (
190- label = label_selectbox ,
191- options = val_repetitions_lengths ,
192- index = default_index ,
193- )
194- st .sidebar .caption (
195- "Choosing a higher or lower number does not mean that the filtering "
196- "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
197- "tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
198- "few or no repetitions, simply because their length gives them more diversity, and we do "
199- "not want to discard such documents."
200- )
201- self .docs = self .docs_checkpoint
202- for i in range (len (self .docs ["repetitions_ratio" ])):
203- self .docs ["repetitions_ratio" ].iloc [i ] = self .docs [
204- "repetitions_ratio"
205- ].iloc [i ][repetitions_length ]
206-
207- cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
208- cutoff_repetitions_ratio = st .sidebar .slider (
209- cutoff_def , 0.0 , 1.0 , 1.0 , step = 0.01
210- )
211- new_key = (
212- "repetitions_ratio" ,
213- cutoff_repetitions_ratio ,
214- True ,
215- repetitions_length ,
216- )
217- keys .append (new_key )
218- Visualization .plot_hist (self .docs , new_key )
219- cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
220- print_discared_by_cond (cond )
221- conds ["repetitions_ratio" ] = [cond ]
180+ with st .sidebar .expander ("Repetitions ratio" ):
181+ val_repetitions_lengths = list (
182+ self .docs ["repetitions_ratio" ].iloc [0 ].keys ()
183+ )
184+ default_index = (
185+ val_repetitions_lengths .index ("10" )
186+ if "10" in val_repetitions_lengths
187+ else 0
188+ )
189+ label_selectbox = "Length of the repetitions (that will determine the repetitions ratio)."
190+ repetitions_length = st .selectbox (
191+ label = label_selectbox ,
192+ options = val_repetitions_lengths ,
193+ index = default_index ,
194+ )
195+ st .caption (
196+ "Choosing a higher or lower number does not mean that the filtering "
197+ "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
198+ "tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
199+ "few or no repetitions, simply because their length gives them more diversity, and we do "
200+ "not want to discard such documents."
201+ )
202+ self .docs = self .docs_checkpoint
203+ for i in range (len (self .docs ["repetitions_ratio" ])):
204+ self .docs ["repetitions_ratio" ].iloc [i ] = self .docs [
205+ "repetitions_ratio"
206+ ].iloc [i ][repetitions_length ]
207+
208+ cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
209+ cutoff_repetitions_ratio = st .slider (
210+ cutoff_def , 0.0 , 1.0 , 1.0 , step = 0.01
211+ )
212+ new_key = (
213+ "repetitions_ratio" ,
214+ cutoff_repetitions_ratio ,
215+ True ,
216+ repetitions_length ,
217+ )
218+ keys .append (new_key )
219+ Visualization .plot_hist (self .docs , new_key )
220+ cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
221+ print_discared_by_cond (cond )
222+ conds ["repetitions_ratio" ] = [cond ]
222223
223224 if "special_characters_ratio" in columns :
224- cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
225- cutoff_special_characters_ratio = st .sidebar .slider (
226- cutoff_def , 0.0 , 1.0 , 1.0 , step = 0.01
227- )
228- new_key = (
229- "special_characters_ratio" ,
230- cutoff_special_characters_ratio ,
231- True ,
232- )
233- keys .append (new_key )
234- Visualization .plot_hist (self .docs , new_key )
235- cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
236- print_discared_by_cond (cond )
237- conds ["special_characters_ratio" ] = [cond ]
225+ with st .sidebar .expander ("Special characters ratio" ):
226+ cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
227+ cutoff_special_characters_ratio = st .slider (
228+ cutoff_def , 0.0 , 1.0 , 1.0 , step = 0.01
229+ )
230+ new_key = (
231+ "special_characters_ratio" ,
232+ cutoff_special_characters_ratio ,
233+ True ,
234+ )
235+ keys .append (new_key )
236+ Visualization .plot_hist (self .docs , new_key )
237+ cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
238+ print_discared_by_cond (cond )
239+ conds ["special_characters_ratio" ] = [cond ]
238240
239241 if "stopwords_ratio" in columns :
240- cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
241- cutoff_stopwords_ratio = st .sidebar .slider (
242- cutoff_def , 0.0 , 1.0 , 0.0 , step = 0.01
243- )
244- new_key = ("stopwords_ratio" , cutoff_stopwords_ratio , False )
245- keys .append (new_key )
246- Visualization .plot_hist (self .docs , new_key )
247- cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
248- print_discared_by_cond (cond )
249- conds ["stopwords_ratio" ] = [cond ]
242+ with st .sidebar .expander ("Stop words ratio" ):
243+ cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
244+ cutoff_stopwords_ratio = st .slider (
245+ cutoff_def , 0.0 , 1.0 , 0.0 , step = 0.01
246+ )
247+ new_key = ("stopwords_ratio" , cutoff_stopwords_ratio , False )
248+ keys .append (new_key )
249+ Visualization .plot_hist (self .docs , new_key )
250+ cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
251+ print_discared_by_cond (cond )
252+ conds ["stopwords_ratio" ] = [cond ]
250253
251254 if "flagged_words_ratio" in columns :
252- cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
253- cutoff_flagged_words_ratio = st .sidebar .slider (
254- cutoff_def , 0.0 , 1.0 , 1.0 , step = 0.01
255- )
256- new_key = ("flagged_words_ratio" , cutoff_flagged_words_ratio , True )
257- keys .append (new_key )
258- Visualization .plot_hist (self .docs , new_key )
259- cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
260- print_discared_by_cond (cond )
261- conds ["flagged_words_ratio" ] = [cond ]
255+ with st .sidebar .expander ("Flagged words ratio" ):
256+ cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
257+ cutoff_flagged_words_ratio = st .slider (
258+ cutoff_def , 0.0 , 1.0 , 1.0 , step = 0.01
259+ )
260+ new_key = ("flagged_words_ratio" , cutoff_flagged_words_ratio , True )
261+ keys .append (new_key )
262+ Visualization .plot_hist (self .docs , new_key )
263+ cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
264+ print_discared_by_cond (cond )
265+ conds ["flagged_words_ratio" ] = [cond ]
262266
263267 if "lang_id_score" in columns :
264- cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
265- cutoff_lang_id_score = st .sidebar .slider (
266- cutoff_def , 0.0 , 1.0 , 0.0 , step = 0.01
267- )
268- new_key = ("lang_id_score" , cutoff_lang_id_score , False )
269- keys .append (new_key )
270- Visualization .plot_hist (self .docs , new_key )
271- cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
272- print_discared_by_cond (cond )
273- conds ["lang_id_score" ] = [cond ]
268+ with st .sidebar .expander ("Language ID confidence score" ):
269+ cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
270+ cutoff_lang_id_score = st .slider (
271+ cutoff_def , 0.0 , 1.0 , 0.0 , step = 0.01
272+ )
273+ new_key = ("lang_id_score" , cutoff_lang_id_score , False )
274+ keys .append (new_key )
275+ Visualization .plot_hist (self .docs , new_key )
276+ cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
277+ print_discared_by_cond (cond )
278+ conds ["lang_id_score" ] = [cond ]
274279
275280 if "perplexity_score" in columns :
276- cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
277- max_pp = int (np .max (self .docs ["perplexity_score" ])) + 1
278- cutoff_perplexity_score = st .sidebar .slider (
279- cutoff_def , 0 , max_pp , max_pp
280- )
281- new_key = ("perplexity_score" , cutoff_perplexity_score , True )
282- keys .append (new_key )
283- Visualization .plot_hist (self .docs , new_key )
284- cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
285- print_discared_by_cond (cond )
286- conds ["perplexity_score" ] = [cond ]
281+ with st .sidebar .expander ("Perplexity score" ):
282+ cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
283+ max_pp = int (np .max (self .docs ["perplexity_score" ])) + 1
284+ cutoff_perplexity_score = st .slider (
285+ cutoff_def , 0 , max_pp , max_pp
286+ )
287+ new_key = ("perplexity_score" , cutoff_perplexity_score , True )
288+ keys .append (new_key )
289+ Visualization .plot_hist (self .docs , new_key )
290+ cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
291+ print_discared_by_cond (cond )
292+ conds ["perplexity_score" ] = [cond ]
287293
288294 return keys , conds
289295
@@ -372,23 +378,23 @@ def filtering_of_words(self):
372378 if not (self .words is None ):
373379 st .sidebar .subheader ("Parameter of the filtering on words" )
374380
375- cutoff_def = "If the length of a word is higher than this number, the word is removed."
376- max_len_word = min ( int ( np . max ( self . words [ "len_word" ])) + 1 , 200 )
377- cutoff_word = st . sidebar . slider ( cutoff_def , 0 , max_len_word , max_len_word )
378- new_key = ( "len_word" , cutoff_word , True )
379- self . parameters . append ( new_key )
380- Visualization . plot_hist ( self .words , new_key )
381- st . sidebar . caption ( "---------" )
382-
383- incorrect_substrings = st .sidebar .checkbox (
384- "Remove words with incorrect substrings."
385- )
386- self . parameters . append (( "incorrect_substrings" , incorrect_substrings ) )
387- st . sidebar . caption ( "---------" )
381+ with st . sidebar . expander ( "Length of words" ):
382+ cutoff_def = "If the length of a word is higher than this number, the word is removed."
383+ max_len_word = min ( int ( np . max ( self . words [ "len_word" ])) + 1 , 200 )
384+ cutoff_word = st . slider ( cutoff_def , 0 , max_len_word , max_len_word )
385+ new_key = ( "len_word" , cutoff_word , True )
386+ self .parameters . append ( new_key )
387+ Visualization . plot_hist ( self . words , new_key )
388+
389+ with st .sidebar .expander ( "Words with incorrect substrings" ):
390+ incorrect_substrings = st . checkbox (
391+ "Remove words with incorrect substrings."
392+ )
393+ self . parameters . append (( "incorrect_substrings" , incorrect_substrings ) )
388394
389- cond_words = self .words ["len_word" ] <= cutoff_word
390- if incorrect_substrings :
391- cond_words = cond_words & np .invert (self .words ["incorrect_substring" ])
395+ cond_words = self .words ["len_word" ] <= cutoff_word
396+ if incorrect_substrings :
397+ cond_words = cond_words & np .invert (self .words ["incorrect_substring" ])
392398
393399 st .header ("Filtering on words" )
394400
@@ -416,6 +422,7 @@ def filtering_of_words(self):
416422 st .dataframe (retained_words )
417423
418424 def download_parameters (self ):
425+ st .sidebar .subheader ("Download parameters" )
419426 btn = st .sidebar .download_button (
420427 label = "Download current parameters as json" ,
421428 data = json .dumps (self .parameters ),
0 commit comments