@@ -368,7 +368,9 @@ def get_cond(key, cutoff, max_cutoff):
368368 f"Filtering on documents, for { self .num_docs } { self .lang } documents"
369369 )
370370
371- Visualization .display_dataset (self .docs , np .invert (all_conds ), "Discarded documents" , "docs" )
371+ Visualization .display_dataset (
372+ self .docs , np .invert (all_conds ), "Discarded documents" , "docs"
373+ )
372374
373375 # st.subheader("Display discarded documents by filter")
374376 display_discarded_documents_by_filter = st .checkbox (
@@ -380,37 +382,74 @@ def get_cond(key, cutoff, max_cutoff):
380382
381383 if "number_words" in columns :
382384 cond_filter = np .invert (np .all (conds ["number_words" ], axis = 0 ))
383- Visualization .display_dataset (self .docs , cond_filter , "Discarded documents for the filter on the number of words" , "docs" )
385+ Visualization .display_dataset (
386+ self .docs ,
387+ cond_filter ,
388+ "Discarded documents for the filter on the number of words" ,
389+ "docs" ,
390+ )
384391
385392 if "repetitions_ratio" in columns :
386393 cond_filter = np .invert (np .all (conds ["repetitions_ratio" ], axis = 0 ))
387- Visualization .display_dataset (self .docs , cond_filter , "Discarded documents for the filter on the repetitions ratio" , "docs" )
394+ Visualization .display_dataset (
395+ self .docs ,
396+ cond_filter ,
397+ "Discarded documents for the filter on the repetitions ratio" ,
398+ "docs" ,
399+ )
388400
389401 if "special_characters_ratio" in columns :
390402 cond_filter = np .invert (
391403 np .all (conds ["special_characters_ratio" ], axis = 0 )
392404 )
393- Visualization .display_dataset (self .docs , cond_filter , "Discarded documents for the filter on the special characters ratio" , "docs" )
405+ Visualization .display_dataset (
406+ self .docs ,
407+ cond_filter ,
408+ "Discarded documents for the filter on the special characters ratio" ,
409+ "docs" ,
410+ )
394411
395412 if "stopwords_ratio" in columns :
396413 cond_filter = np .invert (np .all (conds ["stopwords_ratio" ], axis = 0 ))
397- Visualization .display_dataset (self .docs , cond_filter , "Discarded documents for the filter on the stop words ratio" , "docs" )
414+ Visualization .display_dataset (
415+ self .docs ,
416+ cond_filter ,
417+ "Discarded documents for the filter on the stop words ratio" ,
418+ "docs" ,
419+ )
398420
399421 if "flagged_words_ratio" in columns :
400422 cond_filter = np .invert (
401423 np .all (conds ["flagged_words_ratio" ], axis = 0 )
402424 )
403- Visualization .display_dataset (self .docs , cond_filter , "Discarded documents for the filter on the flagged words ratio" , "docs" )
425+ Visualization .display_dataset (
426+ self .docs ,
427+ cond_filter ,
428+ "Discarded documents for the filter on the flagged words ratio" ,
429+ "docs" ,
430+ )
404431
405432 if "lang_id_score" in columns :
406433 cond_filter = np .invert (np .all (conds ["lang_id_score" ], axis = 0 ))
407- Visualization .display_dataset (self .docs , cond_filter , "Discarded documents for the filter on the language identification confidence score" , "docs" )
434+ Visualization .display_dataset (
435+ self .docs ,
436+ cond_filter ,
437+ "Discarded documents for the filter on the language identification confidence score" ,
438+ "docs" ,
439+ )
408440
409441 if "perplexity_score" in columns :
410442 cond_filter = np .invert (np .all (conds ["perplexity_score" ], axis = 0 ))
411- Visualization .display_dataset (self .docs , cond_filter , "Discarded documents for the filter on the perplexity score" , "docs" )
443+ Visualization .display_dataset (
444+ self .docs ,
445+ cond_filter ,
446+ "Discarded documents for the filter on the perplexity score" ,
447+ "docs" ,
448+ )
412449
413- Visualization .display_dataset (self .docs , all_conds , "Retained documents" , "docs" )
450+ Visualization .display_dataset (
451+ self .docs , all_conds , "Retained documents" , "docs"
452+ )
414453
415454 st .header ("Download data" )
416455
@@ -446,22 +485,37 @@ def filtering_of_words(self):
446485 incorrect_substrings = st .checkbox (
447486 "Remove words with incorrect substrings."
448487 )
449- self .parameters .append (("incorrect_substrings" , incorrect_substrings ))
488+ self .parameters .append (
489+ ("incorrect_substrings" , incorrect_substrings )
490+ )
450491
451492 checkbox = st .checkbox (
452- "Diplay distribution" , value = True , key = "display_distribution_incorrect_substrings"
493+ "Diplay distribution" ,
494+ value = True ,
495+ key = "display_distribution_incorrect_substrings" ,
453496 )
454497 if checkbox :
455498 incor_sub = np .array (self .words ["incorrect_substrings" ]) * 1
456499 with_incor_sub = np .sum (incor_sub )
457500 without_incor_sub = len (incor_sub ) - with_incor_sub
458- st .markdown (f"Number of words with incorrect substrings: { with_incor_sub } " )
459- st .markdown (f"Number of words without incorrect substrings: { without_incor_sub } " )
501+ st .markdown (
502+ f"Number of words with incorrect substrings: { with_incor_sub } "
503+ )
504+ st .markdown (
505+ f"Number of words without incorrect substrings: { without_incor_sub } "
506+ )
460507
461508 if incorrect_substrings :
462- cond_incorrect_substrings = np .invert (self .words ["incorrect_substrings" ])
509+ cond_incorrect_substrings = np .invert (
510+ self .words ["incorrect_substrings" ]
511+ )
463512 else :
464- cond_incorrect_substrings = np .array ([True for i in range (len (self .words ["incorrect_substrings" ]))])
513+ cond_incorrect_substrings = np .array (
514+ [
515+ True
516+ for i in range (len (self .words ["incorrect_substrings" ]))
517+ ]
518+ )
465519 Visualization .print_discarded_by_cond (cond_incorrect_substrings )
466520 conds_words ["incorrect_substrings" ] = cond_incorrect_substrings
467521
@@ -479,7 +533,9 @@ def filtering_of_words(self):
479533 f"we consider in this section words for only { self .num_docs_for_words } documents."
480534 )
481535
482- Visualization .display_dataset (self .words , np .invert (all_conds_words ), "Discarded words" , "words" )
536+ Visualization .display_dataset (
537+ self .words , np .invert (all_conds_words ), "Discarded words" , "words"
538+ )
483539
484540 # st.subheader("Display discarded words by filter")
485541 display_discarded_words_by_filter = st .checkbox (
@@ -490,13 +546,25 @@ def filtering_of_words(self):
490546
491547 if "len_word" in columns :
492548 cond_filter = np .invert (conds_words ["len_word" ])
493- Visualization .display_dataset (self .words , cond_filter , "Discarded words for the filter on length" , "words" )
549+ Visualization .display_dataset (
550+ self .words ,
551+ cond_filter ,
552+ "Discarded words for the filter on length" ,
553+ "words" ,
554+ )
494555
495556 if "incorrect_substrings" in columns :
496557 cond_filter = np .invert (conds_words ["incorrect_substrings" ])
497- Visualization .display_dataset (self .words , cond_filter , "Discarded words for the filter on incorrect substrings" , "words" )
558+ Visualization .display_dataset (
559+ self .words ,
560+ cond_filter ,
561+ "Discarded words for the filter on incorrect substrings" ,
562+ "words" ,
563+ )
498564
499- Visualization .display_dataset (self .words , all_conds_words , "Retained words" , "words" )
565+ Visualization .display_dataset (
566+ self .words , all_conds_words , "Retained words" , "words"
567+ )
500568
501569 def download_parameters (self ):
502570 st .sidebar .subheader ("Download parameters" )
0 commit comments