@@ -121,6 +121,19 @@ def open_data(self):
def set_title(self):
    """Render the page title from the document count and the language."""
    headline = f"{self.num_docs} {self.lang} documents with their stats."
    st.title(headline)
123123
@staticmethod
def plot_hist(dataframe, key, num_bins=50):
    """Optionally draw a histogram of one statistic in the sidebar.

    A sidebar checkbox (one per statistic) controls whether the plot is
    shown. When it is ticked, the values of the column are plotted as a
    histogram and the current cutoff is marked with a dashed red vertical
    line.

    Args:
        dataframe: Table indexable by column name whose result exposes
            ``.values`` (the callers pass ``self.docs`` / ``self.words``).
        key: Indexable parameter description; ``key[0]`` is the column
            name and ``key[1]`` is the cutoff value to mark on the plot.
            Any further elements are ignored.
        num_bins: Number of histogram bins.
    """
    column, cutoff = key[0], key[1]
    show = st.sidebar.checkbox(
        "Display distribution", key=f"display_distribution_{column}"
    )
    if not show:
        return

    values = dataframe[column].values
    median = np.median(values)
    if median != 0:
        # Trim outliers further than 6 median-absolute-deviations from the
        # median so that a few extreme values do not flatten the histogram.
        deviations = np.absolute(values - median)
        mad = np.median(deviations)
        # A zero MAD (more than half of the values equal the median) would
        # make the strict "<" test reject every value and leave an empty
        # plot, so the data is left untrimmed in that case.
        if mad > 0:
            values = values[deviations < 6 * mad]

    fig, ax = plt.subplots()
    try:
        ax.hist(values, bins=num_bins)
        ax.set_title(" ".join(column.split("_")))
        ax.axvline(x=cutoff, color="r", linestyle="dashed")
        st.sidebar.pyplot(fig)
    finally:
        # pyplot keeps every figure alive until it is closed; without this
        # each rerun of the app would leak one figure per ticked checkbox.
        plt.close(fig)
136+
124137 def filtering_of_docs (self ):
125138 st .sidebar .subheader ("Parameters of the filtering on documents" )
126139
@@ -148,6 +161,7 @@ def print_discared_by_cond(cond):
148161 )
149162 new_key = ("number_words" , cutoff_min_number_words , False )
150163 keys .append (new_key )
164+ Visualization .plot_hist (self .docs , new_key )
151165 cond_1 = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
152166 print_discared_by_cond (cond_1 )
153167
@@ -201,6 +215,7 @@ def print_discared_by_cond(cond):
201215 repetitions_length ,
202216 )
203217 keys .append (new_key )
218+ Visualization .plot_hist (self .docs , new_key )
204219 cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
205220 print_discared_by_cond (cond )
206221 conds ["repetitions_ratio" ] = [cond ]
@@ -216,6 +231,7 @@ def print_discared_by_cond(cond):
216231 True ,
217232 )
218233 keys .append (new_key )
234+ Visualization .plot_hist (self .docs , new_key )
219235 cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
220236 print_discared_by_cond (cond )
221237 conds ["special_characters_ratio" ] = [cond ]
@@ -227,6 +243,7 @@ def print_discared_by_cond(cond):
227243 )
228244 new_key = ("stopwords_ratio" , cutoff_stopwords_ratio , False )
229245 keys .append (new_key )
246+ Visualization .plot_hist (self .docs , new_key )
230247 cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
231248 print_discared_by_cond (cond )
232249 conds ["stopwords_ratio" ] = [cond ]
@@ -238,6 +255,7 @@ def print_discared_by_cond(cond):
238255 )
239256 new_key = ("flagged_words_ratio" , cutoff_flagged_words_ratio , True )
240257 keys .append (new_key )
258+ Visualization .plot_hist (self .docs , new_key )
241259 cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
242260 print_discared_by_cond (cond )
243261 conds ["flagged_words_ratio" ] = [cond ]
@@ -249,6 +267,7 @@ def print_discared_by_cond(cond):
249267 )
250268 new_key = ("lang_id_score" , cutoff_lang_id_score , False )
251269 keys .append (new_key )
270+ Visualization .plot_hist (self .docs , new_key )
252271 cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
253272 print_discared_by_cond (cond )
254273 conds ["lang_id_score" ] = [cond ]
@@ -261,6 +280,7 @@ def print_discared_by_cond(cond):
261280 )
262281 new_key = ("perplexity_score" , cutoff_perplexity_score , True )
263282 keys .append (new_key )
283+ Visualization .plot_hist (self .docs , new_key )
264284 cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
265285 print_discared_by_cond (cond )
266286 conds ["perplexity_score" ] = [cond ]
@@ -355,7 +375,9 @@ def filtering_of_words(self):
355375 cutoff_def = "If the length of a word is higher than this number, the word is removed."
356376 max_len_word = min (int (np .max (self .words ["len_word" ])) + 1 , 200 )
357377 cutoff_word = st .sidebar .slider (cutoff_def , 0 , max_len_word , max_len_word )
358- self .parameters .append (("len_word" , cutoff_word , True ))
378+ new_key = ("len_word" , cutoff_word , True )
379+ self .parameters .append (new_key )
380+ Visualization .plot_hist (self .words , new_key )
359381 st .sidebar .caption ("---------" )
360382
361383 incorrect_substrings = st .sidebar .checkbox (
@@ -400,29 +422,6 @@ def download_parameters(self):
400422 file_name = f"parameters_{ self .lang_dataset_id } .json" ,
401423 )
402424
def plot_distributions_filtering_parameters(self):
    """Show, on demand, a bar-chart histogram for every filtering statistic.

    A header and a "Display distributions" checkbox are always rendered.
    When the checkbox is ticked, one histogram is drawn for each distinct
    column name found in ``self.keys`` (taken from ``self.docs``), followed
    by the ``len_word`` histogram from ``self.words`` when that table is set.
    """
    st.header("Distributions of the filtering parameters")

    if not st.checkbox("Display distributions"):
        return

    def plot_hist(dataframe, key, num_bins=50):
        # Bins span 0 .. max(values); the bin width is reported below the chart.
        st.subheader(" ".join(key.split("_")))
        values = dataframe[key].values
        upper = np.max(values)
        counts, _edges = np.histogram(values, bins=num_bins, range=(0, upper))
        st.bar_chart(counts)
        st.markdown(f"Each bin is of size: {upper / num_bins}.")

    # dict.fromkeys de-duplicates the column names while keeping their
    # first-seen order.
    for column in dict.fromkeys(el[0] for el in self.keys):
        plot_hist(self.docs, column)

    if self.words is not None:
        plot_hist(self.words, "len_word")
425-
426425 def plot_zipf_law (self ):
427426 if not (self .words is None ):
428427 st .header ("Zipf's Law" )
@@ -578,7 +577,6 @@ def visualization(self):
578577 self .filtering_of_docs ()
579578 self .filtering_of_words ()
580579 self .download_parameters ()
581- self .plot_distributions_filtering_parameters ()
582580 self .plot_zipf_law ()
583581 self .analyse_personal_doc ()
584582 self .download_data ()
0 commit comments