@@ -82,11 +82,12 @@ def get_binary_file_downloader_html(bin_file, file_label="File"):
8282 return href
8383
8484 st .markdown (
85- "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail in this " +
86- get_binary_file_downloader_html (
85+ "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail in this "
86+ + get_binary_file_downloader_html (
8787 self .path_instructions ,
8888 "pdf" ,
89- ) + "." ,
89+ )
90+ + "." ,
9091 unsafe_allow_html = True ,
9192 )
9293
@@ -202,7 +203,9 @@ def print_discared_by_cond(cond):
202203 "few or no repetitions, simply because their length gives them more diversity, and we do "
203204 "not want to discard such documents."
204205 )
205- self .docs ["repetitions_ratio" ] = self .docs_checkpoint ["repetitions_ratio" ]
206+ self .docs ["repetitions_ratio" ] = self .docs_checkpoint [
207+ "repetitions_ratio"
208+ ]
206209 for i in range (len (self .docs ["repetitions_ratio" ])):
207210 self .docs ["repetitions_ratio" ].iloc [i ] = self .docs [
208211 "repetitions_ratio"
@@ -243,13 +246,21 @@ def print_discared_by_cond(cond):
243246
244247 if "stopwords_ratio" in columns :
245248 with st .sidebar .expander ("Stop words ratio" ):
246- stopwords_file = st .file_uploader ("Upload your own list of stop words (one per line). If there is none, the default one is used." )
249+ stopwords_file = st .file_uploader (
250+ "Upload your own list of stop words (one per line). If there is none, the default one is used."
251+ )
247252 if stopwords_file :
248- new_stopwords = StringIO (stopwords_file .getvalue ().decode ("utf-8" )).read ()
253+ new_stopwords = StringIO (
254+ stopwords_file .getvalue ().decode ("utf-8" )
255+ ).read ()
249256 new_stopwords = set (new_stopwords .split ("\n " ))
250- self .docs ["stopwords_ratio" ] = self .docs_checkpoint ["stopwords_ratio" ]
257+ self .docs ["stopwords_ratio" ] = self .docs_checkpoint [
258+ "stopwords_ratio"
259+ ]
251260 for i in range (len (self .docs ["stopwords_ratio" ])):
252- self .docs ["stopwords_ratio" ].iloc [i ] = Filtering .compute_stopwords_ratio (
261+ self .docs ["stopwords_ratio" ].iloc [
262+ i
263+ ] = Filtering .compute_stopwords_ratio (
253264 self .docs ["text" ].iloc [i ],
254265 self .sentencepiece_model_tok ,
255266 self .param ["strip_characters" ],
@@ -271,13 +282,21 @@ def print_discared_by_cond(cond):
271282
272283 if "flagged_words_ratio" in columns :
273284 with st .sidebar .expander ("Flagged words ratio" ):
274- flagged_words_file = st .file_uploader ("Upload your own list of flagged words (one per line). If there is none, the default one is used." )
285+ flagged_words_file = st .file_uploader (
286+ "Upload your own list of flagged words (one per line). If there is none, the default one is used."
287+ )
275288 if flagged_words_file :
276- new_flagged_words = StringIO (flagged_words_file .getvalue ().decode ("utf-8" )).read ()
289+ new_flagged_words = StringIO (
290+ flagged_words_file .getvalue ().decode ("utf-8" )
291+ ).read ()
277292 new_flagged_words = set (new_flagged_words .split ("\n " ))
278- self .docs ["flagged_words_ratio" ] = self .docs_checkpoint ["flagged_words_ratio" ]
293+ self .docs ["flagged_words_ratio" ] = self .docs_checkpoint [
294+ "flagged_words_ratio"
295+ ]
279296 for i in range (len (self .docs ["flagged_words_ratio" ])):
280- self .docs ["flagged_words_ratio" ].iloc [i ] = Filtering .compute_flagged_words_ratio (
297+ self .docs ["flagged_words_ratio" ].iloc [
298+ i
299+ ] = Filtering .compute_flagged_words_ratio (
281300 self .docs ["text" ].iloc [i ],
282301 self .sentencepiece_model_tok ,
283302 self .param ["strip_characters" ],
@@ -626,7 +645,7 @@ def is_doc_discarded(key, score):
626645 )
627646
628647 def visualization (self ):
629- self .warning_preamble ()
648+ # self.warning_preamble()
630649 self .preamble ()
631650 self .open_data ()
632651 self .set_title ()
0 commit comments