Skip to content

Commit 810522b

Browse files
committed
visualization: parameters are in expanders
1 parent 9174c6b commit 810522b

1 file changed

Lines changed: 146 additions & 139 deletions

File tree

ac_dc/visualization/visualization.py

Lines changed: 146 additions & 139 deletions
Original file line number | Diff line number | Diff line change
@@ -123,16 +123,16 @@ def set_title(self):
123123

124124
@staticmethod
125125
def plot_hist(dataframe, key, num_bins=50):
126-
checkbox = st.sidebar.checkbox("Diplay distribution", key=f"display_distribution_{key[0]}")
126+
checkbox = st.checkbox("Diplay distribution", value=True, key=f"display_distribution_{key[0]}")
127127
if checkbox:
128128
fig, ax = plt.subplots()
129129
val = dataframe[key[0]].values
130130
if np.median(val) != 0:
131-
val = val[abs(val - np.median(val)) < 6 * np.median(np.absolute(val - np.median(val)))]
132-
ax.hist(val, bins=num_bins)
131+
val = val[abs(val - np.median(val)) < 9 * np.median(np.absolute(val - np.median(val)))]
132+
ax.hist(val, bins=num_bins, density=True)
133133
ax.set_title(" ".join(key[0].split("_")))
134134
ax.axvline(x=key[1], color='r', linestyle='dashed')
135-
st.sidebar.pyplot(fig)
135+
st.pyplot(fig)
136136

137137
def filtering_of_docs(self):
138138
st.sidebar.subheader("Parameters of the filtering on documents")
@@ -148,142 +148,148 @@ def get_cond(key, cutoff, max_cutoff):
148148
return self.docs[key] >= cutoff
149149

150150
def print_discared_by_cond(cond):
151-
st.sidebar.caption(
151+
st.caption(
152152
f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter."
153153
)
154-
st.sidebar.caption("---------")
155154

156155
if "number_words" in columns:
157-
cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
158-
max_nb_words = int(np.max(self.docs["number_words"])) + 1
159-
cutoff_min_number_words = st.sidebar.slider(
160-
cutoff_def, 0, min(max_nb_words, 500), 0
161-
)
162-
new_key = ("number_words", cutoff_min_number_words, False)
163-
keys.append(new_key)
164-
Visualization.plot_hist(self.docs, new_key)
165-
cond_1 = get_cond(new_key[0], new_key[1], new_key[2])
166-
print_discared_by_cond(cond_1)
167-
168-
cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
169-
cutoff_max_number_words = st.sidebar.slider(
170-
cutoff_def, 0, max_nb_words, max_nb_words
171-
)
172-
new_key = ("number_words", cutoff_max_number_words, True)
173-
keys.append(new_key)
174-
cond_2 = get_cond(new_key[0], new_key[1], new_key[2])
175-
print_discared_by_cond(cond_2)
156+
with st.sidebar.expander("Number of words"):
157+
cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
158+
max_nb_words = int(np.max(self.docs["number_words"])) + 1
159+
cutoff_min_number_words = st.slider(
160+
cutoff_def, 0, min(max_nb_words, 500), 0
161+
)
162+
new_key = ("number_words", cutoff_min_number_words, False)
163+
keys.append(new_key)
164+
Visualization.plot_hist(self.docs, new_key)
165+
cond_1 = get_cond(new_key[0], new_key[1], new_key[2])
166+
print_discared_by_cond(cond_1)
167+
168+
cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
169+
cutoff_max_number_words = st.slider(
170+
cutoff_def, 0, max_nb_words, max_nb_words
171+
)
172+
new_key = ("number_words", cutoff_max_number_words, True)
173+
keys.append(new_key)
174+
cond_2 = get_cond(new_key[0], new_key[1], new_key[2])
175+
print_discared_by_cond(cond_2)
176176

177-
conds["number_words"] = [cond_1, cond_2]
177+
conds["number_words"] = [cond_1, cond_2]
178178

179179
if "repetitions_ratio" in columns:
180-
val_repetitions_lengths = list(
181-
self.docs["repetitions_ratio"].iloc[0].keys()
182-
)
183-
default_index = (
184-
val_repetitions_lengths.index("10")
185-
if "10" in val_repetitions_lengths
186-
else 0
187-
)
188-
label_selectbox = "Length of the repetitions (that will determine the repetitions ratio)."
189-
repetitions_length = st.sidebar.selectbox(
190-
label=label_selectbox,
191-
options=val_repetitions_lengths,
192-
index=default_index,
193-
)
194-
st.sidebar.caption(
195-
"Choosing a higher or lower number does not mean that the filtering "
196-
"is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
197-
"tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
198-
"few or no repetitions, simply because their length gives them more diversity, and we do "
199-
"not want to discard such documents."
200-
)
201-
self.docs = self.docs_checkpoint
202-
for i in range(len(self.docs["repetitions_ratio"])):
203-
self.docs["repetitions_ratio"].iloc[i] = self.docs[
204-
"repetitions_ratio"
205-
].iloc[i][repetitions_length]
206-
207-
cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
208-
cutoff_repetitions_ratio = st.sidebar.slider(
209-
cutoff_def, 0.0, 1.0, 1.0, step=0.01
210-
)
211-
new_key = (
212-
"repetitions_ratio",
213-
cutoff_repetitions_ratio,
214-
True,
215-
repetitions_length,
216-
)
217-
keys.append(new_key)
218-
Visualization.plot_hist(self.docs, new_key)
219-
cond = get_cond(new_key[0], new_key[1], new_key[2])
220-
print_discared_by_cond(cond)
221-
conds["repetitions_ratio"] = [cond]
180+
with st.sidebar.expander("Repetitions ratio"):
181+
val_repetitions_lengths = list(
182+
self.docs["repetitions_ratio"].iloc[0].keys()
183+
)
184+
default_index = (
185+
val_repetitions_lengths.index("10")
186+
if "10" in val_repetitions_lengths
187+
else 0
188+
)
189+
label_selectbox = "Length of the repetitions (that will determine the repetitions ratio)."
190+
repetitions_length = st.selectbox(
191+
label=label_selectbox,
192+
options=val_repetitions_lengths,
193+
index=default_index,
194+
)
195+
st.caption(
196+
"Choosing a higher or lower number does not mean that the filtering "
197+
"is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
198+
"tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
199+
"few or no repetitions, simply because their length gives them more diversity, and we do "
200+
"not want to discard such documents."
201+
)
202+
self.docs = self.docs_checkpoint
203+
for i in range(len(self.docs["repetitions_ratio"])):
204+
self.docs["repetitions_ratio"].iloc[i] = self.docs[
205+
"repetitions_ratio"
206+
].iloc[i][repetitions_length]
207+
208+
cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
209+
cutoff_repetitions_ratio = st.slider(
210+
cutoff_def, 0.0, 1.0, 1.0, step=0.01
211+
)
212+
new_key = (
213+
"repetitions_ratio",
214+
cutoff_repetitions_ratio,
215+
True,
216+
repetitions_length,
217+
)
218+
keys.append(new_key)
219+
Visualization.plot_hist(self.docs, new_key)
220+
cond = get_cond(new_key[0], new_key[1], new_key[2])
221+
print_discared_by_cond(cond)
222+
conds["repetitions_ratio"] = [cond]
222223

223224
if "special_characters_ratio" in columns:
224-
cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
225-
cutoff_special_characters_ratio = st.sidebar.slider(
226-
cutoff_def, 0.0, 1.0, 1.0, step=0.01
227-
)
228-
new_key = (
229-
"special_characters_ratio",
230-
cutoff_special_characters_ratio,
231-
True,
232-
)
233-
keys.append(new_key)
234-
Visualization.plot_hist(self.docs, new_key)
235-
cond = get_cond(new_key[0], new_key[1], new_key[2])
236-
print_discared_by_cond(cond)
237-
conds["special_characters_ratio"] = [cond]
225+
with st.sidebar.expander("Special characters ratio"):
226+
cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
227+
cutoff_special_characters_ratio = st.slider(
228+
cutoff_def, 0.0, 1.0, 1.0, step=0.01
229+
)
230+
new_key = (
231+
"special_characters_ratio",
232+
cutoff_special_characters_ratio,
233+
True,
234+
)
235+
keys.append(new_key)
236+
Visualization.plot_hist(self.docs, new_key)
237+
cond = get_cond(new_key[0], new_key[1], new_key[2])
238+
print_discared_by_cond(cond)
239+
conds["special_characters_ratio"] = [cond]
238240

239241
if "stopwords_ratio" in columns:
240-
cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
241-
cutoff_stopwords_ratio = st.sidebar.slider(
242-
cutoff_def, 0.0, 1.0, 0.0, step=0.01
243-
)
244-
new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
245-
keys.append(new_key)
246-
Visualization.plot_hist(self.docs, new_key)
247-
cond = get_cond(new_key[0], new_key[1], new_key[2])
248-
print_discared_by_cond(cond)
249-
conds["stopwords_ratio"] = [cond]
242+
with st.sidebar.expander("Stop words ratio"):
243+
cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
244+
cutoff_stopwords_ratio = st.slider(
245+
cutoff_def, 0.0, 1.0, 0.0, step=0.01
246+
)
247+
new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
248+
keys.append(new_key)
249+
Visualization.plot_hist(self.docs, new_key)
250+
cond = get_cond(new_key[0], new_key[1], new_key[2])
251+
print_discared_by_cond(cond)
252+
conds["stopwords_ratio"] = [cond]
250253

251254
if "flagged_words_ratio" in columns:
252-
cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
253-
cutoff_flagged_words_ratio = st.sidebar.slider(
254-
cutoff_def, 0.0, 1.0, 1.0, step=0.01
255-
)
256-
new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True)
257-
keys.append(new_key)
258-
Visualization.plot_hist(self.docs, new_key)
259-
cond = get_cond(new_key[0], new_key[1], new_key[2])
260-
print_discared_by_cond(cond)
261-
conds["flagged_words_ratio"] = [cond]
255+
with st.sidebar.expander("Flagged words ratio"):
256+
cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
257+
cutoff_flagged_words_ratio = st.slider(
258+
cutoff_def, 0.0, 1.0, 1.0, step=0.01
259+
)
260+
new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True)
261+
keys.append(new_key)
262+
Visualization.plot_hist(self.docs, new_key)
263+
cond = get_cond(new_key[0], new_key[1], new_key[2])
264+
print_discared_by_cond(cond)
265+
conds["flagged_words_ratio"] = [cond]
262266

263267
if "lang_id_score" in columns:
264-
cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
265-
cutoff_lang_id_score = st.sidebar.slider(
266-
cutoff_def, 0.0, 1.0, 0.0, step=0.01
267-
)
268-
new_key = ("lang_id_score", cutoff_lang_id_score, False)
269-
keys.append(new_key)
270-
Visualization.plot_hist(self.docs, new_key)
271-
cond = get_cond(new_key[0], new_key[1], new_key[2])
272-
print_discared_by_cond(cond)
273-
conds["lang_id_score"] = [cond]
268+
with st.sidebar.expander("Language ID confidence score"):
269+
cutoff_def = "If the confidence score for the language identification prediction of a document is lower than this number, the document is removed."
270+
cutoff_lang_id_score = st.slider(
271+
cutoff_def, 0.0, 1.0, 0.0, step=0.01
272+
)
273+
new_key = ("lang_id_score", cutoff_lang_id_score, False)
274+
keys.append(new_key)
275+
Visualization.plot_hist(self.docs, new_key)
276+
cond = get_cond(new_key[0], new_key[1], new_key[2])
277+
print_discared_by_cond(cond)
278+
conds["lang_id_score"] = [cond]
274279

275280
if "perplexity_score" in columns:
276-
cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
277-
max_pp = int(np.max(self.docs["perplexity_score"])) + 1
278-
cutoff_perplexity_score = st.sidebar.slider(
279-
cutoff_def, 0, max_pp, max_pp
280-
)
281-
new_key = ("perplexity_score", cutoff_perplexity_score, True)
282-
keys.append(new_key)
283-
Visualization.plot_hist(self.docs, new_key)
284-
cond = get_cond(new_key[0], new_key[1], new_key[2])
285-
print_discared_by_cond(cond)
286-
conds["perplexity_score"] = [cond]
281+
with st.sidebar.expander("Perplexity score"):
282+
cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
283+
max_pp = int(np.max(self.docs["perplexity_score"])) + 1
284+
cutoff_perplexity_score = st.slider(
285+
cutoff_def, 0, max_pp, max_pp
286+
)
287+
new_key = ("perplexity_score", cutoff_perplexity_score, True)
288+
keys.append(new_key)
289+
Visualization.plot_hist(self.docs, new_key)
290+
cond = get_cond(new_key[0], new_key[1], new_key[2])
291+
print_discared_by_cond(cond)
292+
conds["perplexity_score"] = [cond]
287293

288294
return keys, conds
289295

@@ -372,23 +378,23 @@ def filtering_of_words(self):
372378
if not (self.words is None):
373379
st.sidebar.subheader("Parameter of the filtering on words")
374380

375-
cutoff_def = "If the length of a word is higher than this number, the word is removed."
376-
max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
377-
cutoff_word = st.sidebar.slider(cutoff_def, 0, max_len_word, max_len_word)
378-
new_key = ("len_word", cutoff_word, True)
379-
self.parameters.append(new_key)
380-
Visualization.plot_hist(self.words, new_key)
381-
st.sidebar.caption("---------")
382-
383-
incorrect_substrings = st.sidebar.checkbox(
384-
"Remove words with incorrect substrings."
385-
)
386-
self.parameters.append(("incorrect_substrings", incorrect_substrings))
387-
st.sidebar.caption("---------")
381+
with st.sidebar.expander("Length of words"):
382+
cutoff_def = "If the length of a word is higher than this number, the word is removed."
383+
max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
384+
cutoff_word = st.slider(cutoff_def, 0, max_len_word, max_len_word)
385+
new_key = ("len_word", cutoff_word, True)
386+
self.parameters.append(new_key)
387+
Visualization.plot_hist(self.words, new_key)
388+
389+
with st.sidebar.expander("Words with incorrect substrings"):
390+
incorrect_substrings = st.checkbox(
391+
"Remove words with incorrect substrings."
392+
)
393+
self.parameters.append(("incorrect_substrings", incorrect_substrings))
388394

389-
cond_words = self.words["len_word"] <= cutoff_word
390-
if incorrect_substrings:
391-
cond_words = cond_words & np.invert(self.words["incorrect_substring"])
395+
cond_words = self.words["len_word"] <= cutoff_word
396+
if incorrect_substrings:
397+
cond_words = cond_words & np.invert(self.words["incorrect_substring"])
392398

393399
st.header("Filtering on words")
394400

@@ -416,6 +422,7 @@ def filtering_of_words(self):
416422
st.dataframe(retained_words)
417423

418424
def download_parameters(self):
425+
st.sidebar.subheader("Download parameters")
419426
btn = st.sidebar.download_button(
420427
label="Download current parameters as json",
421428
data=json.dumps(self.parameters),

0 commit comments

Comments
 (0)