44
55import os
66
7+ from io import StringIO
78import base64
89import json
910import pandas as pd
@@ -201,7 +202,7 @@ def print_discared_by_cond(cond):
201202 "few or no repetitions, simply because their length gives them more diversity, and we do "
202203 "not want to discard such documents."
203204 )
204- self .docs = self .docs_checkpoint
205+ self .docs [ "repetitions_ratio" ] = self .docs_checkpoint [ "repetitions_ratio" ]
205206 for i in range (len (self .docs ["repetitions_ratio" ])):
206207 self .docs ["repetitions_ratio" ].iloc [i ] = self .docs [
207208 "repetitions_ratio"
@@ -242,6 +243,21 @@ def print_discared_by_cond(cond):
242243
243244 if "stopwords_ratio" in columns :
244245 with st .sidebar .expander ("Stop words ratio" ):
246+ stopwords_file = st .file_uploader ("Upload your own list of stop words (one per line). If there is none, the default one is used." )
247+ if stopwords_file :
248+ new_stopwords = StringIO (stopwords_file .getvalue ().decode ("utf-8" )).read ()
249+ new_stopwords = set (new_stopwords .split ("\n " ))
250+ self .docs ["stopwords_ratio" ] = self .docs_checkpoint ["stopwords_ratio" ]
251+ for i in range (len (self .docs ["stopwords_ratio" ])):
252+ self .docs ["stopwords_ratio" ].iloc [i ] = Filtering .compute_stopwords_ratio (
253+ self .docs ["text" ].iloc [i ],
254+ self .sentencepiece_model_tok ,
255+ self .param ["strip_characters" ],
256+ self .param ["cond_words_augmentation" ],
257+ self .param ["words_augmentation_group_sizes" ],
258+ self .param ["words_augmentation_join_char" ],
259+ new_stopwords ,
260+ )
245261 cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
246262 cutoff_stopwords_ratio = st .slider (
247263 cutoff_def , 0.0 , 1.0 , 0.0 , step = 0.01
@@ -255,6 +271,21 @@ def print_discared_by_cond(cond):
255271
256272 if "flagged_words_ratio" in columns :
257273 with st .sidebar .expander ("Flagged words ratio" ):
274+ flagged_words_file = st .file_uploader ("Upload your own list of flagged words (one per line). If there is none, the default one is used." )
275+ if flagged_words_file :
276+ new_flagged_words = StringIO (flagged_words_file .getvalue ().decode ("utf-8" )).read ()
277+ new_flagged_words = set (new_flagged_words .split ("\n " ))
278+ self .docs ["flagged_words_ratio" ] = self .docs_checkpoint ["flagged_words_ratio" ]
279+ for i in range (len (self .docs ["flagged_words_ratio" ])):
280+ self .docs ["flagged_words_ratio" ].iloc [i ] = Filtering .compute_flagged_words_ratio (
281+ self .docs ["text" ].iloc [i ],
282+ self .sentencepiece_model_tok ,
283+ self .param ["strip_characters" ],
284+ self .param ["cond_words_augmentation" ],
285+ self .param ["words_augmentation_group_sizes" ],
286+ self .param ["words_augmentation_join_char" ],
287+ new_flagged_words ,
288+ )
258289 cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
259290 cutoff_flagged_words_ratio = st .slider (
260291 cutoff_def , 0.0 , 1.0 , 1.0 , step = 0.01
0 commit comments