Skip to content

Commit cd5f548

Browse files
committed
visualization: show the percentage and examples of discarded words per filter in the word-filtering step
1 parent 66a1978 commit cd5f548

2 files changed

Lines changed: 84 additions & 90 deletions

File tree

ac_dc/visualization/get_data_for_visualization.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def compute_stats(self):
7171
words = [
7272
{
7373
"len_word": len(word),
74-
"incorrect_substring": any(
74+
"incorrect_substrings": any(
7575
[
7676
(i_substr in word)
7777
for i_substr in self.param["incorrect_word_substrings"]

ac_dc/visualization/visualization.py

Lines changed: 83 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,12 @@ def open_data(self):
120120
def set_title(self):
121121
st.title(f"Filtering visualization")
122122

123+
@staticmethod
124+
def print_discarded_by_cond(cond):
125+
st.caption(
126+
f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter."
127+
)
128+
123129
@staticmethod
124130
def plot_hist(dataframe, key, num_bins=50):
125131
checkbox = st.checkbox(
@@ -138,6 +144,17 @@ def plot_hist(dataframe, key, num_bins=50):
138144
ax.axvline(x=key[1], color="r", linestyle="dashed")
139145
st.pyplot(fig)
140146

147+
@staticmethod
148+
def display_dataset(dataframe, cond, description, type_of_examples):
149+
displayed_examples = dataframe.loc[cond]
150+
st.subheader(
151+
f"{description}: {len(displayed_examples)} {type_of_examples} ({len(displayed_examples) / len(dataframe.index) * 100:.2f}%)"
152+
)
153+
st.markdown(
154+
"Click on a column to sort by it, place the cursor on the text to display it."
155+
)
156+
st.dataframe(displayed_examples)
157+
141158
def filtering_of_docs(self):
142159
st.sidebar.subheader("Parameters of the filtering on documents")
143160

@@ -151,11 +168,6 @@ def get_cond(key, cutoff, max_cutoff):
151168
return self.docs[key] <= cutoff
152169
return self.docs[key] >= cutoff
153170

154-
def print_discared_by_cond(cond):
155-
st.caption(
156-
f"{(len(cond) - np.sum(1*cond)) / len(cond) * 100:.2f}% of the total is discarded with this filter."
157-
)
158-
159171
if "number_words" in columns:
160172
with st.sidebar.expander("Number of words"):
161173
cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
@@ -167,7 +179,7 @@ def print_discared_by_cond(cond):
167179
keys.append(new_key)
168180
Visualization.plot_hist(self.docs, new_key)
169181
cond_1 = get_cond(new_key[0], new_key[1], new_key[2])
170-
print_discared_by_cond(cond_1)
182+
Visualization.print_discarded_by_cond(cond_1)
171183

172184
cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
173185
cutoff_max_number_words = st.slider(
@@ -176,7 +188,7 @@ def print_discared_by_cond(cond):
176188
new_key = ("number_words", cutoff_max_number_words, True)
177189
keys.append(new_key)
178190
cond_2 = get_cond(new_key[0], new_key[1], new_key[2])
179-
print_discared_by_cond(cond_2)
191+
Visualization.print_discarded_by_cond(cond_2)
180192

181193
conds["number_words"] = [cond_1, cond_2]
182194

@@ -224,7 +236,7 @@ def print_discared_by_cond(cond):
224236
keys.append(new_key)
225237
Visualization.plot_hist(self.docs, new_key)
226238
cond = get_cond(new_key[0], new_key[1], new_key[2])
227-
print_discared_by_cond(cond)
239+
Visualization.print_discarded_by_cond(cond)
228240
conds["repetitions_ratio"] = [cond]
229241

230242
if "special_characters_ratio" in columns:
@@ -241,7 +253,7 @@ def print_discared_by_cond(cond):
241253
keys.append(new_key)
242254
Visualization.plot_hist(self.docs, new_key)
243255
cond = get_cond(new_key[0], new_key[1], new_key[2])
244-
print_discared_by_cond(cond)
256+
Visualization.print_discarded_by_cond(cond)
245257
conds["special_characters_ratio"] = [cond]
246258

247259
if "stopwords_ratio" in columns:
@@ -277,7 +289,7 @@ def print_discared_by_cond(cond):
277289
keys.append(new_key)
278290
Visualization.plot_hist(self.docs, new_key)
279291
cond = get_cond(new_key[0], new_key[1], new_key[2])
280-
print_discared_by_cond(cond)
292+
Visualization.print_discarded_by_cond(cond)
281293
conds["stopwords_ratio"] = [cond]
282294

283295
if "flagged_words_ratio" in columns:
@@ -313,7 +325,7 @@ def print_discared_by_cond(cond):
313325
keys.append(new_key)
314326
Visualization.plot_hist(self.docs, new_key)
315327
cond = get_cond(new_key[0], new_key[1], new_key[2])
316-
print_discared_by_cond(cond)
328+
Visualization.print_discarded_by_cond(cond)
317329
conds["flagged_words_ratio"] = [cond]
318330

319331
if "lang_id_score" in columns:
@@ -326,7 +338,7 @@ def print_discared_by_cond(cond):
326338
keys.append(new_key)
327339
Visualization.plot_hist(self.docs, new_key)
328340
cond = get_cond(new_key[0], new_key[1], new_key[2])
329-
print_discared_by_cond(cond)
341+
Visualization.print_discarded_by_cond(cond)
330342
conds["lang_id_score"] = [cond]
331343

332344
if "perplexity_score" in columns:
@@ -338,7 +350,7 @@ def print_discared_by_cond(cond):
338350
keys.append(new_key)
339351
Visualization.plot_hist(self.docs, new_key)
340352
cond = get_cond(new_key[0], new_key[1], new_key[2])
341-
print_discared_by_cond(cond)
353+
Visualization.print_discarded_by_cond(cond)
342354
conds["perplexity_score"] = [cond]
343355

344356
return keys, conds
@@ -356,17 +368,7 @@ def print_discared_by_cond(cond):
356368
f"Filtering on documents, for {self.num_docs} {self.lang} documents"
357369
)
358370

359-
def display_dataset(cond, description):
360-
displayed_docs = self.docs.loc[cond]
361-
st.subheader(
362-
f"{description}: {len(displayed_docs)} docs ({len(displayed_docs) / self.num_docs * 100:.2f}%)"
363-
)
364-
st.markdown(
365-
"Click on a column to sort by it, place the cursor on the text to display it."
366-
)
367-
st.dataframe(displayed_docs)
368-
369-
display_dataset(np.invert(all_conds), "Discarded documents")
371+
Visualization.display_dataset(self.docs, np.invert(all_conds), "Discarded documents", "docs")
370372

371373
# st.subheader("Display discarded documents by filter")
372374
display_discarded_documents_by_filter = st.checkbox(
@@ -378,58 +380,37 @@ def display_dataset(cond, description):
378380

379381
if "number_words" in columns:
380382
cond_filter = np.invert(np.all(conds["number_words"], axis=0))
381-
display_dataset(
382-
cond_filter,
383-
"Discarded documents for the filter on the number of words",
384-
)
383+
Visualization.display_dataset(self.docs, cond_filter, "Discarded documents for the filter on the number of words", "docs")
385384

386385
if "repetitions_ratio" in columns:
387386
cond_filter = np.invert(np.all(conds["repetitions_ratio"], axis=0))
388-
display_dataset(
389-
cond_filter,
390-
"Discarded documents for the filter on the repetitions ratio",
391-
)
387+
Visualization.display_dataset(self.docs, cond_filter, "Discarded documents for the filter on the repetitions ratio", "docs")
392388

393389
if "special_characters_ratio" in columns:
394390
cond_filter = np.invert(
395391
np.all(conds["special_characters_ratio"], axis=0)
396392
)
397-
display_dataset(
398-
cond_filter,
399-
"Discarded documents for the filter on the special characters ratio",
400-
)
393+
Visualization.display_dataset(self.docs, cond_filter, "Discarded documents for the filter on the special characters ratio", "docs")
401394

402395
if "stopwords_ratio" in columns:
403396
cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
404-
display_dataset(
405-
cond_filter,
406-
"Discarded documents for the filter on the stop words ratio",
407-
)
397+
Visualization.display_dataset(self.docs, cond_filter, "Discarded documents for the filter on the stop words ratio", "docs")
408398

409399
if "flagged_words_ratio" in columns:
410400
cond_filter = np.invert(
411401
np.all(conds["flagged_words_ratio"], axis=0)
412402
)
413-
display_dataset(
414-
cond_filter,
415-
"Discarded documents for the filter on the flagged words ratio",
416-
)
403+
Visualization.display_dataset(self.docs, cond_filter, "Discarded documents for the filter on the flagged words ratio", "docs")
417404

418405
if "lang_id_score" in columns:
419406
cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
420-
display_dataset(
421-
cond_filter,
422-
"Discarded documents for the filter on the language identification confidence score",
423-
)
407+
Visualization.display_dataset(self.docs, cond_filter, "Discarded documents for the filter on the language identification confidence score", "docs")
424408

425409
if "perplexity_score" in columns:
426410
cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
427-
display_dataset(
428-
cond_filter,
429-
"Discarded documents for the filter on the perplexity score",
430-
)
411+
Visualization.display_dataset(self.docs, cond_filter, "Discarded documents for the filter on the perplexity score", "docs")
431412

432-
display_dataset(all_conds, "Retained documents")
413+
Visualization.display_dataset(self.docs, all_conds, "Retained documents", "docs")
433414

434415
st.header("Download data")
435416

@@ -442,57 +423,70 @@ def display_dataset(cond, description):
442423

443424
def filtering_of_words(self):
444425
if not (self.words is None):
445-
st.sidebar.subheader("Parameter of the filtering on words")
426+
columns = list(self.words)
446427

447-
with st.sidebar.expander("Length of words"):
448-
cutoff_def = "If the length of a word is higher than this number, the word is removed."
449-
max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
450-
cutoff_word = st.slider(cutoff_def, 0, max_len_word, max_len_word)
451-
new_key = ("len_word", cutoff_word, True)
452-
self.parameters.append(new_key)
453-
Visualization.plot_hist(self.words, new_key)
454-
455-
with st.sidebar.expander("Words with incorrect substrings"):
456-
incorrect_substrings = st.checkbox(
457-
"Remove words with incorrect substrings."
458-
)
459-
self.parameters.append(("incorrect_substrings", incorrect_substrings))
428+
st.sidebar.subheader("Parameter of the filtering on words")
460429

461-
cond_words = self.words["len_word"] <= cutoff_word
462-
if incorrect_substrings:
463-
cond_words = cond_words & np.invert(
464-
self.words["incorrect_substring"]
430+
conds_words = {}
431+
432+
if "len_word" in columns:
433+
with st.sidebar.expander("Length of words"):
434+
cutoff_def = "If the length of a word is higher than this number, the word is removed."
435+
max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
436+
cutoff_word = st.slider(cutoff_def, 0, max_len_word, max_len_word)
437+
new_key = ("len_word", cutoff_word, True)
438+
self.parameters.append(new_key)
439+
Visualization.plot_hist(self.words, new_key)
440+
cond_len_words = self.words["len_word"] <= cutoff_word
441+
Visualization.print_discarded_by_cond(cond_len_words)
442+
conds_words["len_word"] = cond_len_words
443+
444+
if "incorrect_substrings" in columns:
445+
with st.sidebar.expander("Words with incorrect substrings"):
446+
incorrect_substrings = st.checkbox(
447+
"Remove words with incorrect substrings."
465448
)
449+
self.parameters.append(("incorrect_substrings", incorrect_substrings))
450+
451+
if incorrect_substrings:
452+
cond_incorrect_substrings = np.invert(self.words["incorrect_substrings"])
453+
else:
454+
cond_incorrect_substrings = np.array([True for i in range(len(self.words["incorrect_substrings"]))])
455+
Visualization.print_discarded_by_cond(cond_incorrect_substrings)
456+
conds_words["incorrect_substrings"] = cond_incorrect_substrings
457+
458+
all_conds_words = np.all(list(conds_words.values()), axis=0)
466459

467460
with st.expander(
468-
f"Filtering on words, for {self.num_docs} {self.lang} documents"
461+
f"Filtering on words, for {self.num_docs_for_words} {self.lang} documents"
469462
):
470463
st.header(
471-
f"Filtering on words, for {self.num_docs} {self.lang} documents"
464+
f"Filtering on words, for {self.num_docs_for_words} {self.lang} documents"
472465
)
473466

474467
st.markdown(
475468
f"Since the number of words is way larger than the number of documents, "
476-
f"we consider in this section words for the first {self.num_docs_for_words} documents only."
469+
f"we consider in this section words for only {self.num_docs_for_words} documents."
477470
)
478471

479-
discarded_words = self.words.loc[np.invert(cond_words)]
480-
st.subheader(
481-
f"Discarded words: {len(discarded_words)} words ({len(discarded_words) / len(self.words) * 100:.2f}%)"
482-
)
483-
st.markdown(
484-
"Click on a column to sort by it, place the cursor on the text to display it."
485-
)
486-
st.dataframe(discarded_words)
472+
Visualization.display_dataset(self.words, np.invert(all_conds_words), "Discarded words", "words")
487473

488-
retained_words = self.words.loc[cond_words]
489-
st.subheader(
490-
f"Retained words: {len(retained_words)} words ({len(retained_words) / len(self.words) * 100:.2f}%)"
491-
)
492-
st.markdown(
493-
"Click on a column to sort by it, place the cursor on the text to display it."
474+
# st.subheader("Display discarded words by filter")
475+
display_discarded_words_by_filter = st.checkbox(
476+
"Display discarded words by filter"
494477
)
495-
st.dataframe(retained_words)
478+
479+
if display_discarded_words_by_filter:
480+
481+
if "len_word" in columns:
482+
cond_filter = np.invert(conds_words["len_word"])
483+
Visualization.display_dataset(self.words, cond_filter, "Discarded words for the filter on length", "words")
484+
485+
if "incorrect_substrings" in columns:
486+
cond_filter = np.invert(conds_words["incorrect_substrings"])
487+
Visualization.display_dataset(self.words, cond_filter, "Discarded words for the filter on incorrect substrings", "words")
488+
489+
Visualization.display_dataset(self.words, all_conds_words, "Retained words", "words")
496490

497491
def download_parameters(self):
498492
st.sidebar.subheader("Download parameters")

0 commit comments

Comments (0)