@@ -120,6 +120,12 @@ def open_data(self):
def set_title(self):
    """Write the page title of the Streamlit app."""
    # Plain literal: the original f-string had no placeholders.
    st.title("Filtering visualization")
122122
@staticmethod
def print_discarded_by_cond(cond):
    """Show, as a Streamlit caption, the share of entries a filter discards.

    Args:
        cond: Boolean mask (array-like) in which a truthy value marks an
            entry that passes the filter; the falsy entries are the
            discarded ones.
    """
    total = len(cond)
    # An empty mask discards nothing; guard the division so the caption
    # never has to format the result of 0 / 0.
    if total:
        discarded_pct = (total - np.sum(cond)) / total * 100
    else:
        discarded_pct = 0.0
    st.caption(
        f"{discarded_pct:.2f} % of the total is discarded with this filter."
    )
128+
123129 @staticmethod
124130 def plot_hist (dataframe , key , num_bins = 50 ):
125131 checkbox = st .checkbox (
@@ -138,6 +144,17 @@ def plot_hist(dataframe, key, num_bins=50):
138144 ax .axvline (x = key [1 ], color = "r" , linestyle = "dashed" )
139145 st .pyplot (fig )
140146
@staticmethod
def display_dataset(dataframe, cond, description, type_of_examples):
    """Display the rows of ``dataframe`` selected by ``cond`` in Streamlit.

    Writes a subheader with the number and percentage of selected rows,
    a short usage hint, and then the selected rows as a table.

    Args:
        dataframe: Table to select from; it is indexed with ``.loc`` and
            its total size is taken from ``.index``.
        cond: Row selector passed to ``dataframe.loc``.
        description: Text placed at the start of the subheader.
        type_of_examples: Noun naming the rows in the subheader
            (callers pass e.g. "docs" or "words").
    """
    displayed_examples = dataframe.loc[cond]
    num_displayed = len(displayed_examples)
    num_total = len(dataframe.index)
    # An empty table would otherwise raise ZeroDivisionError here.
    percentage = num_displayed / num_total * 100 if num_total else 0.0
    st.subheader(
        f"{description} : {num_displayed} {type_of_examples} ({percentage:.2f} %)"
    )
    st.markdown(
        "Click on a column to sort by it, place the cursor on the text to display it."
    )
    st.dataframe(displayed_examples)
157+
141158 def filtering_of_docs (self ):
142159 st .sidebar .subheader ("Parameters of the filtering on documents" )
143160
@@ -151,11 +168,6 @@ def get_cond(key, cutoff, max_cutoff):
151168 return self .docs [key ] <= cutoff
152169 return self .docs [key ] >= cutoff
153170
154- def print_discared_by_cond (cond ):
155- st .caption (
156- f"{ (len (cond ) - np .sum (1 * cond )) / len (cond ) * 100 :.2f} % of the total is discarded with this filter."
157- )
158-
159171 if "number_words" in columns :
160172 with st .sidebar .expander ("Number of words" ):
161173 cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
@@ -167,7 +179,7 @@ def print_discared_by_cond(cond):
167179 keys .append (new_key )
168180 Visualization .plot_hist (self .docs , new_key )
169181 cond_1 = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
170- print_discared_by_cond (cond_1 )
182+ Visualization . print_discarded_by_cond (cond_1 )
171183
172184 cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
173185 cutoff_max_number_words = st .slider (
@@ -176,7 +188,7 @@ def print_discared_by_cond(cond):
176188 new_key = ("number_words" , cutoff_max_number_words , True )
177189 keys .append (new_key )
178190 cond_2 = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
179- print_discared_by_cond (cond_2 )
191+ Visualization . print_discarded_by_cond (cond_2 )
180192
181193 conds ["number_words" ] = [cond_1 , cond_2 ]
182194
@@ -224,7 +236,7 @@ def print_discared_by_cond(cond):
224236 keys .append (new_key )
225237 Visualization .plot_hist (self .docs , new_key )
226238 cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
227- print_discared_by_cond (cond )
239+ Visualization . print_discarded_by_cond (cond )
228240 conds ["repetitions_ratio" ] = [cond ]
229241
230242 if "special_characters_ratio" in columns :
@@ -241,7 +253,7 @@ def print_discared_by_cond(cond):
241253 keys .append (new_key )
242254 Visualization .plot_hist (self .docs , new_key )
243255 cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
244- print_discared_by_cond (cond )
256+ Visualization . print_discarded_by_cond (cond )
245257 conds ["special_characters_ratio" ] = [cond ]
246258
247259 if "stopwords_ratio" in columns :
@@ -277,7 +289,7 @@ def print_discared_by_cond(cond):
277289 keys .append (new_key )
278290 Visualization .plot_hist (self .docs , new_key )
279291 cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
280- print_discared_by_cond (cond )
292+ Visualization . print_discarded_by_cond (cond )
281293 conds ["stopwords_ratio" ] = [cond ]
282294
283295 if "flagged_words_ratio" in columns :
@@ -313,7 +325,7 @@ def print_discared_by_cond(cond):
313325 keys .append (new_key )
314326 Visualization .plot_hist (self .docs , new_key )
315327 cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
316- print_discared_by_cond (cond )
328+ Visualization . print_discarded_by_cond (cond )
317329 conds ["flagged_words_ratio" ] = [cond ]
318330
319331 if "lang_id_score" in columns :
@@ -326,7 +338,7 @@ def print_discared_by_cond(cond):
326338 keys .append (new_key )
327339 Visualization .plot_hist (self .docs , new_key )
328340 cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
329- print_discared_by_cond (cond )
341+ Visualization . print_discarded_by_cond (cond )
330342 conds ["lang_id_score" ] = [cond ]
331343
332344 if "perplexity_score" in columns :
@@ -338,7 +350,7 @@ def print_discared_by_cond(cond):
338350 keys .append (new_key )
339351 Visualization .plot_hist (self .docs , new_key )
340352 cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
341- print_discared_by_cond (cond )
353+ Visualization . print_discarded_by_cond (cond )
342354 conds ["perplexity_score" ] = [cond ]
343355
344356 return keys , conds
@@ -356,17 +368,7 @@ def print_discared_by_cond(cond):
356368 f"Filtering on documents, for { self .num_docs } { self .lang } documents"
357369 )
358370
359- def display_dataset (cond , description ):
360- displayed_docs = self .docs .loc [cond ]
361- st .subheader (
362- f"{ description } : { len (displayed_docs )} docs ({ len (displayed_docs ) / self .num_docs * 100 :.2f} %)"
363- )
364- st .markdown (
365- "Click on a column to sort by it, place the cursor on the text to display it."
366- )
367- st .dataframe (displayed_docs )
368-
369- display_dataset (np .invert (all_conds ), "Discarded documents" )
371+ Visualization .display_dataset (self .docs , np .invert (all_conds ), "Discarded documents" , "docs" )
370372
371373 # st.subheader("Display discarded documents by filter")
372374 display_discarded_documents_by_filter = st .checkbox (
@@ -378,58 +380,37 @@ def display_dataset(cond, description):
378380
379381 if "number_words" in columns :
380382 cond_filter = np .invert (np .all (conds ["number_words" ], axis = 0 ))
381- display_dataset (
382- cond_filter ,
383- "Discarded documents for the filter on the number of words" ,
384- )
383+ Visualization .display_dataset (self .docs , cond_filter , "Discarded documents for the filter on the number of words" , "docs" )
385384
386385 if "repetitions_ratio" in columns :
387386 cond_filter = np .invert (np .all (conds ["repetitions_ratio" ], axis = 0 ))
388- display_dataset (
389- cond_filter ,
390- "Discarded documents for the filter on the repetitions ratio" ,
391- )
387+ Visualization .display_dataset (self .docs , cond_filter , "Discarded documents for the filter on the repetitions ratio" , "docs" )
392388
393389 if "special_characters_ratio" in columns :
394390 cond_filter = np .invert (
395391 np .all (conds ["special_characters_ratio" ], axis = 0 )
396392 )
397- display_dataset (
398- cond_filter ,
399- "Discarded documents for the filter on the special characters ratio" ,
400- )
393+ Visualization .display_dataset (self .docs , cond_filter , "Discarded documents for the filter on the special characters ratio" , "docs" )
401394
402395 if "stopwords_ratio" in columns :
403396 cond_filter = np .invert (np .all (conds ["stopwords_ratio" ], axis = 0 ))
404- display_dataset (
405- cond_filter ,
406- "Discarded documents for the filter on the stop words ratio" ,
407- )
397+ Visualization .display_dataset (self .docs , cond_filter , "Discarded documents for the filter on the stop words ratio" , "docs" )
408398
409399 if "flagged_words_ratio" in columns :
410400 cond_filter = np .invert (
411401 np .all (conds ["flagged_words_ratio" ], axis = 0 )
412402 )
413- display_dataset (
414- cond_filter ,
415- "Discarded documents for the filter on the flagged words ratio" ,
416- )
403+ Visualization .display_dataset (self .docs , cond_filter , "Discarded documents for the filter on the flagged words ratio" , "docs" )
417404
418405 if "lang_id_score" in columns :
419406 cond_filter = np .invert (np .all (conds ["lang_id_score" ], axis = 0 ))
420- display_dataset (
421- cond_filter ,
422- "Discarded documents for the filter on the language identification confidence score" ,
423- )
407+ Visualization .display_dataset (self .docs , cond_filter , "Discarded documents for the filter on the language identification confidence score" , "docs" )
424408
425409 if "perplexity_score" in columns :
426410 cond_filter = np .invert (np .all (conds ["perplexity_score" ], axis = 0 ))
427- display_dataset (
428- cond_filter ,
429- "Discarded documents for the filter on the perplexity score" ,
430- )
411+ Visualization .display_dataset (self .docs , cond_filter , "Discarded documents for the filter on the perplexity score" , "docs" )
431412
432- display_dataset (all_conds , "Retained documents" )
413+ Visualization . display_dataset (self . docs , all_conds , "Retained documents" , "docs " )
433414
434415 st .header ("Download data" )
435416
@@ -442,57 +423,70 @@ def display_dataset(cond, description):
442423
def filtering_of_words(self):
    """Build the word-level filtering controls and result tables.

    Does nothing when ``self.words`` is None. Otherwise, for each known
    column present in ``self.words`` ("len_word", "incorrect_substrings")
    it adds a sidebar control, records the chosen parameter in
    ``self.parameters`` and stores a boolean keep-mask. The masks are then
    combined and the discarded / retained words are displayed.
    """
    if self.words is None:
        return

    columns = list(self.words)

    st.sidebar.subheader("Parameter of the filtering on words")

    # Maps a filter name to its keep-mask (truthy = word is retained).
    conds_words = {}

    if "len_word" in columns:
        with st.sidebar.expander("Length of words"):
            cutoff_def = "If the length of a word is higher than this number, the word is removed."
            max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
            cutoff_word = st.slider(cutoff_def, 0, max_len_word, max_len_word)
            new_key = ("len_word", cutoff_word, True)
            self.parameters.append(new_key)
            Visualization.plot_hist(self.words, new_key)
            cond_len_words = self.words["len_word"] <= cutoff_word
            Visualization.print_discarded_by_cond(cond_len_words)
            conds_words["len_word"] = cond_len_words

    if "incorrect_substrings" in columns:
        with st.sidebar.expander("Words with incorrect substrings"):
            incorrect_substrings = st.checkbox(
                "Remove words with incorrect substrings."
            )
            self.parameters.append(("incorrect_substrings", incorrect_substrings))

            # NOTE(review): nesting inside the expander mirrors the
            # "len_word" section above — confirm against the real file.
            if incorrect_substrings:
                cond_incorrect_substrings = np.invert(
                    self.words["incorrect_substrings"]
                )
            else:
                # Filter disabled: keep every word.
                cond_incorrect_substrings = np.ones(
                    len(self.words["incorrect_substrings"]), dtype=bool
                )
            Visualization.print_discarded_by_cond(cond_incorrect_substrings)
            conds_words["incorrect_substrings"] = cond_incorrect_substrings

    if conds_words:
        all_conds_words = np.all(list(conds_words.values()), axis=0)
    else:
        # No known filter column: np.all over an empty list would collapse
        # to a scalar, which cannot be used as a row mask. Keep everything.
        all_conds_words = np.ones(len(self.words), dtype=bool)

    with st.expander(
        f"Filtering on words, for {self.num_docs_for_words} {self.lang} documents"
    ):
        st.header(
            f"Filtering on words, for {self.num_docs_for_words} {self.lang} documents"
        )

        st.markdown(
            "Since the number of words is way larger than the number of documents, "
            f"we consider in this section words for only {self.num_docs_for_words} documents."
        )

        Visualization.display_dataset(
            self.words, np.invert(all_conds_words), "Discarded words", "words"
        )

        display_discarded_words_by_filter = st.checkbox(
            "Display discarded words by filter"
        )

        if display_discarded_words_by_filter:

            if "len_word" in columns:
                cond_filter = np.invert(conds_words["len_word"])
                Visualization.display_dataset(
                    self.words,
                    cond_filter,
                    "Discarded words for the filter on length",
                    "words",
                )

            if "incorrect_substrings" in columns:
                cond_filter = np.invert(conds_words["incorrect_substrings"])
                Visualization.display_dataset(
                    self.words,
                    cond_filter,
                    "Discarded words for the filter on incorrect substrings",
                    "words",
                )

        Visualization.display_dataset(
            self.words, all_conds_words, "Retained words", "words"
        )
497491 def download_parameters (self ):
498492 st .sidebar .subheader ("Download parameters" )
0 commit comments