77import base64
88import json
99import pandas as pd
10+ pd .options .mode .chained_assignment = None
1011
1112import numpy as np
1213
@@ -73,16 +74,17 @@ def open_data(self):
7374 doc ["text" ][: self .max_len_text_display ]
7475 + " [...] [THIS LONG TEXT HAS BEEN TRUNCATED FOR DISPLAY REASONS]"
7576 )
76- self .docs = pd .DataFrame (docs )
77+ self .docs_checkpoint = pd .DataFrame (docs )
78+ self .docs = self .docs_checkpoint
7779
def set_title(self):
    """Show the page title built from the document count and language.

    Reads ``self.num_docs`` and ``self.lang`` and passes the formatted
    heading to ``st.title``.
    """
    # Build the heading separately so the display call stays trivial.
    heading = f"{self.num_docs} {self.lang} documents with their stats."
    st.title(heading)
8082
8183 def filtering_of_docs (self ):
8284 st .sidebar .subheader ("Parameters of the filtering on documents" )
8385
84- def set_sliders (docs ):
85- columns = list (docs )
86+ def set_sliders ():
87+ columns = list (self . docs )
8688 keys = []
8789 conds = {}
8890
@@ -99,7 +101,7 @@ def print_discared_by_cond(cond):
99101
100102 if "number_words" in columns :
101103 cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
102- max_nb_words = int (np .max (docs ["number_words" ])) + 1
104+ max_nb_words = int (np .max (self . docs ["number_words" ])) + 1
103105 cutoff_min_number_words = st .sidebar .slider (
104106 cutoff_def , 0 , min (max_nb_words , 500 ), 0
105107 )
@@ -119,6 +121,46 @@ def print_discared_by_cond(cond):
119121
120122 conds ["number_words" ] = [cond_1 , cond_2 ]
121123
124+ if "repetitions_ratio" in columns :
125+ val_repetitions_lengths = list (
126+ self .docs ["repetitions_ratio" ].iloc [0 ].keys ()
127+ )
128+ default_index = (
129+ val_repetitions_lengths .index ("10" )
130+ if "10" in val_repetitions_lengths
131+ else 0
132+ )
133+ label_selectbox = (
134+ "Length of the repetitions (that will determine the repetitions ratio). "
135+ "Choosing a higher or lower number does not mean that the filtering "
136+ "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
137+ "tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
138+ "few or no repetitions, simply because their length gives them more diversity, and we do "
139+ "not want to discard such documents."
140+ )
141+ repetitions_length = st .sidebar .selectbox (
142+ label = label_selectbox ,
143+ options = val_repetitions_lengths ,
144+ index = default_index ,
145+ )
146+ self .docs = self .docs_checkpoint
147+ for i in range (len (self .docs ["repetitions_ratio" ])):
148+ self .docs ["repetitions_ratio" ].iloc [i ] = self .docs ["repetitions_ratio" ].iloc [i ][repetitions_length ]
149+
150+ cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
151+ cutoff_repetitions_ratio = st .sidebar .slider (
152+ cutoff_def , 0.0 , 1.0 , 1.0 , step = 0.01
153+ )
154+ new_key = (
155+ "repetitions_ratio" ,
156+ cutoff_repetitions_ratio ,
157+ True ,
158+ )
159+ keys .append (new_key )
160+ cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
161+ print_discared_by_cond (cond )
162+ conds ["repetitions_ratio" ] = [cond ]
163+
122164 if "special_characters_ratio" in columns :
123165 cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
124166 cutoff_special_characters_ratio = st .sidebar .slider (
@@ -169,7 +211,7 @@ def print_discared_by_cond(cond):
169211
170212 if "perplexity_score" in columns :
171213 cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
172- max_pp = int (np .max (docs ["perplexity_score" ])) + 1
214+ max_pp = int (np .max (self . docs ["perplexity_score" ])) + 1
173215 cutoff_perplexity_score = st .sidebar .slider (
174216 cutoff_def , 0 , max_pp , max_pp
175217 )
@@ -181,7 +223,7 @@ def print_discared_by_cond(cond):
181223
182224 return keys , conds
183225
184- self .keys , conds = set_sliders (self . docs )
226+ self .keys , conds = set_sliders ()
185227
186228 all_conds = [subcond for cond in list (conds .values ()) for subcond in cond ]
187229 all_conds = np .all (all_conds , axis = 0 )
@@ -215,6 +257,13 @@ def display_dataset(cond, description):
215257 "Discarded documents for the filter on the number of words" ,
216258 )
217259
260+ if "repetitions_ratio" in columns :
261+ cond_filter = np .invert (np .all (conds ["repetitions_ratio" ], axis = 0 ))
262+ display_dataset (
263+ cond_filter ,
264+ "Discarded documents for the filter on the repetitions ratio" ,
265+ )
266+
218267 if "special_characters_ratio" in columns :
219268 cond_filter = np .invert (
220269 np .all (conds ["special_characters_ratio" ], axis = 0 )
0 commit comments