@@ -73,10 +73,6 @@ def warning_preamble(self):
7373 )
7474
7575 def preamble (self ):
76- st .markdown (
77- "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail."
78- )
79-
8076 def get_binary_file_downloader_html (bin_file , file_label = "File" ):
8177 with open (bin_file , "rb" ) as f :
8278 data = f .read ()
@@ -85,10 +81,11 @@ def get_binary_file_downloader_html(bin_file, file_label="File"):
8581 return href
8682
8783 st .markdown (
84+ "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail in this " +
8885 get_binary_file_downloader_html (
8986 self .path_instructions ,
90- "Download the explanation of the filtering pipeline as pdf" ,
91- ),
87+ "pdf" ,
88+ ) + "." ,
9289 unsafe_allow_html = True ,
9390 )
9491
@@ -123,15 +120,20 @@ def set_title(self):
123120
124121 @staticmethod
125122 def plot_hist (dataframe , key , num_bins = 50 ):
126- checkbox = st .checkbox ("Diplay distribution" , value = True , key = f"display_distribution_{ key [0 ]} " )
123+ checkbox = st .checkbox (
124+ "Diplay distribution" , value = True , key = f"display_distribution_{ key [0 ]} "
125+ )
127126 if checkbox :
128127 fig , ax = plt .subplots ()
129128 val = dataframe [key [0 ]].values
130129 if np .median (val ) != 0 :
131- val = val [abs (val - np .median (val )) < 9 * np .median (np .absolute (val - np .median (val )))]
130+ val = val [
131+ abs (val - np .median (val ))
132+ < 9 * np .median (np .absolute (val - np .median (val )))
133+ ]
132134 ax .hist (val , bins = num_bins , density = True )
133135 ax .set_title (" " .join (key [0 ].split ("_" )))
134- ax .axvline (x = key [1 ], color = 'r' , linestyle = ' dashed' )
136+ ax .axvline (x = key [1 ], color = "r" , linestyle = " dashed" )
135137 st .pyplot (fig )
136138
137139 def filtering_of_docs (self ):
@@ -281,9 +283,7 @@ def print_discared_by_cond(cond):
281283 with st .sidebar .expander ("Perplexity score" ):
282284 cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
283285 max_pp = int (np .max (self .docs ["perplexity_score" ])) + 1
284- cutoff_perplexity_score = st .slider (
285- cutoff_def , 0 , max_pp , max_pp
286- )
286+ cutoff_perplexity_score = st .slider (cutoff_def , 0 , max_pp , max_pp )
287287 new_key = ("perplexity_score" , cutoff_perplexity_score , True )
288288 keys .append (new_key )
289289 Visualization .plot_hist (self .docs , new_key )
@@ -299,8 +299,12 @@ def print_discared_by_cond(cond):
299299 all_conds = [subcond for cond in list (conds .values ()) for subcond in cond ]
300300 all_conds = np .all (all_conds , axis = 0 )
301301
302- with st .expander (f"Filtering on documents, for { self .num_docs } { self .lang } documents" ):
303- st .header (f"Filtering on documents, for { self .num_docs } { self .lang } documents" )
302+ with st .expander (
303+ f"Filtering on documents, for { self .num_docs } { self .lang } documents"
304+ ):
305+ st .header (
306+ f"Filtering on documents, for { self .num_docs } { self .lang } documents"
307+ )
304308
305309 def display_dataset (cond , description ):
306310 displayed_docs = self .docs .loc [cond ]
@@ -353,7 +357,9 @@ def display_dataset(cond, description):
353357 )
354358
355359 if "flagged_words_ratio" in columns :
356- cond_filter = np .invert (np .all (conds ["flagged_words_ratio" ], axis = 0 ))
360+ cond_filter = np .invert (
361+ np .all (conds ["flagged_words_ratio" ], axis = 0 )
362+ )
357363 display_dataset (
358364 cond_filter ,
359365 "Discarded documents for the filter on the flagged words ratio" ,
@@ -404,10 +410,16 @@ def filtering_of_words(self):
404410
405411 cond_words = self .words ["len_word" ] <= cutoff_word
406412 if incorrect_substrings :
407- cond_words = cond_words & np .invert (self .words ["incorrect_substring" ])
413+ cond_words = cond_words & np .invert (
414+ self .words ["incorrect_substring" ]
415+ )
408416
409- with st .expander (f"Filtering on words, for { self .num_docs } { self .lang } documents" ):
410- st .header (f"Filtering on words, for { self .num_docs } { self .lang } documents" )
417+ with st .expander (
418+ f"Filtering on words, for { self .num_docs } { self .lang } documents"
419+ ):
420+ st .header (
421+ f"Filtering on words, for { self .num_docs } { self .lang } documents"
422+ )
411423
412424 st .markdown (
413425 f"Since the number of words is way larger than the number of documents, "
@@ -514,7 +526,9 @@ def is_doc_discarded(key, score):
514526 )
515527 )
516528 special_characters_ratio = round (special_characters_ratio , 3 )
517- st .markdown (f"Special characters ratio: { special_characters_ratio } " )
529+ st .markdown (
530+ f"Special characters ratio: { special_characters_ratio } "
531+ )
518532 if is_doc_discarded (key , special_characters_ratio ):
519533 is_discarded = True
520534
@@ -581,7 +595,7 @@ def is_doc_discarded(key, score):
581595 )
582596
583597 def visualization (self ):
584- # self.warning_preamble()
598+ self .warning_preamble ()
585599 self .preamble ()
586600 self .open_data ()
587601 self .set_title ()
0 commit comments