
Commit bf3ddf0

button to download parameters
1 parent 842d731 commit bf3ddf0

1 file changed: ac_dc/visualization/visualization.py (114 additions & 88 deletions)
@@ -171,9 +171,7 @@ def print_discared_by_cond(cond):
                 if "10" in val_repetitions_lengths
                 else 0
             )
-            label_selectbox = (
-                "Length of the repetitions (that will determine the repetitions ratio)."
-            )
+            label_selectbox = "Length of the repetitions (that will determine the repetitions ratio)."
             repetitions_length = st.sidebar.selectbox(
                 label=label_selectbox,
                 options=val_repetitions_lengths,
@@ -270,6 +268,7 @@ def print_discared_by_cond(cond):
             return keys, conds
 
         self.keys, conds = set_sliders()
+        self.parameters = self.keys * 1
 
         all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
         all_conds = np.all(all_conds, axis=0)
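
A note on the new line `self.parameters = self.keys * 1`: multiplying a list by 1 returns a new list with the same elements, so it acts as a shallow copy, and later `self.parameters.append(...)` calls do not mutate `self.keys`. A minimal illustration (the values are made up, not from the repo):

    keys = [("number_words", 10, False)]
    parameters = keys * 1                      # shallow copy, same as keys[:] or list(keys)
    parameters.append(("len_word", 25, True))  # grows the copy only
    assert len(keys) == 1                      # the original list is untouched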
@@ -356,10 +355,14 @@ def filtering_of_words(self):
         cutoff_def = "If the length of a word is higher than this number, the word is removed."
         max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
         cutoff_word = st.sidebar.slider(cutoff_def, 0, max_len_word, max_len_word)
+        self.parameters.append(("len_word", cutoff_word, True))
+        st.sidebar.caption("---------")
 
         incorrect_substrings = st.sidebar.checkbox(
             "Remove words with incorrect substrings."
         )
+        self.parameters.append(("incorrect_substrings", incorrect_substrings))
+        st.sidebar.caption("---------")
 
         cond_words = self.words["len_word"] <= cutoff_word
         if incorrect_substrings:
@@ -390,6 +393,13 @@ def filtering_of_words(self):
         )
         st.dataframe(retained_words)
 
+    def download_parameters(self):
+        btn = st.sidebar.download_button(
+            label="Download current parameters as json",
+            data=json.dumps(self.parameters),
+            file_name=f"parameters_{self.lang_dataset_id}.json",
+        )
+
     def plot_distributions_filtering_parameters(self):
         st.header("Distributions of the filtering parameters")
 
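The new `download_parameters` method relies on Streamlit's `st.download_button`, which serves a string or bytes payload as a downloadable file. A standalone sketch of the same pattern (the sample parameters and file name here are illustrative, not taken from the repo):

    import json

    import streamlit as st

    # Parameters in the same (name, cutoff, is_max_cutoff) shape that the
    # visualization accumulates in self.parameters.
    parameters = [
        ("number_words", 10, False),
        ("len_word", 25, True),
    ]

    st.sidebar.download_button(
        label="Download current parameters as json",
        data=json.dumps(parameters),  # tuples serialize as JSON arrays
        file_name="parameters_en.json",
        mime="application/json",
    )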

@@ -446,94 +456,109 @@ def analyse_personal_doc(self):
         is_discarded = False
 
         def is_doc_discarded(key, score):
-            if key[2]: # max cutoff
+            if key[2]:  # max cutoff
                 return score > key[1]
             else:
                 return score < key[1]
 
-        st.markdown("Statistics of the document:")
-
-        for key in self.keys:
-            if key[0] == "number_words":
-                words = ModifyingDocuments.get_words_from_document(
-                    personal_doc,
-                    self.sentencepiece_model_tok,
-                    lower_case=False,
-                    strip_characters=self.param["strip_characters"],
-                )
-                if key[2]:
-                    st.markdown(f"Number of words: {len(words)}")
-                if is_doc_discarded(key, len(words)):
-                    is_discarded = True
-
-            elif key[0] == "repetitions_ratio":
-                repetitions_ratio = Filtering.compute_repetitions_ratio(personal_doc, int(key[3]))
-                repetitions_ratio = round(repetitions_ratio, 3)
-                st.markdown(f"Repetitions ratio: {repetitions_ratio}")
-                if is_doc_discarded(key, repetitions_ratio):
-                    is_discarded = True
-
-            elif key[0] == "special_characters_ratio":
-                special_characters_ratio = Filtering.compute_special_characters_ratio(
-                    personal_doc, self.param["special_characters"]
-                )
-                special_characters_ratio = round(special_characters_ratio, 3)
-                st.markdown(f"Special characters ratio: {special_characters_ratio}")
-                if is_doc_discarded(key, special_characters_ratio):
-                    is_discarded = True
-
-            elif key[0] == "stopwords_ratio":
-                stopwords_ratio = Filtering.compute_stopwords_ratio(
-                    personal_doc,
-                    self.sentencepiece_model_tok,
-                    self.param["strip_characters"],
-                    self.param["cond_words_augmentation"],
-                    self.param["words_augmentation_group_sizes"],
-                    self.param["words_augmentation_join_char"],
-                    self.stopwords,
-                )
-                stopwords_ratio = round(stopwords_ratio, 3)
-                st.markdown(f"Stop words ratio: {stopwords_ratio}")
-                if is_doc_discarded(key, stopwords_ratio):
-                    is_discarded = True
-
-            elif key[0] == "badwords_ratio":
-                badwords_ratio = Filtering.compute_badwords_ratio(
-                    personal_doc,
-                    self.sentencepiece_model_tok,
-                    self.param["strip_characters"],
-                    self.param["cond_words_augmentation"],
-                    self.param["words_augmentation_group_sizes"],
-                    self.param["words_augmentation_join_char"],
-                    self.badwords,
-                )
-                badwords_ratio = round(badwords_ratio, 3)
-                st.markdown(f"Flagged words ratio: {badwords_ratio}")
-                if is_doc_discarded(key, badwords_ratio):
-                    is_discarded = True
-
-            elif key[0] == "lang_id_score":
-                lang_pred_dataset_id, lang_id_score = Filtering.compute_lang_id_pred_score(
-                    personal_doc, self.model_lang_id
-                )
-                lang_id_score = round(lang_id_score, 3)
-                st.markdown(f"Language identification confidence score: {lang_id_score}")
-                if is_doc_discarded(key, badwords_ratio) or (self.lang_dataset_id != lang_pred_dataset_id):
-                    is_discarded = True
-
-            elif key[0] == "perplexity_score":
-                perplexity_score = Filtering.compute_perplexity_score(
-                    personal_doc,
-                    self.sentencepiece_model,
-                    self.kenlm_model,
-                )
-                perplexity_score = round(perplexity_score, 3)
-                st.markdown(f"Perplexity score: {perplexity_score}")
-                if is_doc_discarded(key, perplexity_score):
-                    is_discarded = True
-
-        is_discarded = "" if is_discarded else "not "
-        st.markdown(f"With the current filtering parameters, this document **is {is_discarded}discarded**.")
+        if personal_doc:
+
+            st.markdown("Statistics of the document:")
+
+            for key in self.keys:
+                if key[0] == "number_words":
+                    words = ModifyingDocuments.get_words_from_document(
+                        personal_doc,
+                        self.sentencepiece_model_tok,
+                        lower_case=False,
+                        strip_characters=self.param["strip_characters"],
+                    )
+                    if key[2]:
+                        st.markdown(f"Number of words: {len(words)}")
+                    if is_doc_discarded(key, len(words)):
+                        is_discarded = True
+
+                elif key[0] == "repetitions_ratio":
+                    repetitions_ratio = Filtering.compute_repetitions_ratio(
+                        personal_doc, int(key[3])
+                    )
+                    repetitions_ratio = round(repetitions_ratio, 3)
+                    st.markdown(f"Repetitions ratio: {repetitions_ratio}")
+                    if is_doc_discarded(key, repetitions_ratio):
+                        is_discarded = True
+
+                elif key[0] == "special_characters_ratio":
+                    special_characters_ratio = (
+                        Filtering.compute_special_characters_ratio(
+                            personal_doc, self.param["special_characters"]
+                        )
+                    )
+                    special_characters_ratio = round(special_characters_ratio, 3)
+                    st.markdown(f"Special characters ratio: {special_characters_ratio}")
+                    if is_doc_discarded(key, special_characters_ratio):
+                        is_discarded = True
+
+                elif key[0] == "stopwords_ratio":
+                    stopwords_ratio = Filtering.compute_stopwords_ratio(
+                        personal_doc,
+                        self.sentencepiece_model_tok,
+                        self.param["strip_characters"],
+                        self.param["cond_words_augmentation"],
+                        self.param["words_augmentation_group_sizes"],
+                        self.param["words_augmentation_join_char"],
+                        self.stopwords,
+                    )
+                    stopwords_ratio = round(stopwords_ratio, 3)
+                    st.markdown(f"Stop words ratio: {stopwords_ratio}")
+                    if is_doc_discarded(key, stopwords_ratio):
+                        is_discarded = True
+
+                elif key[0] == "badwords_ratio":
+                    badwords_ratio = Filtering.compute_badwords_ratio(
+                        personal_doc,
+                        self.sentencepiece_model_tok,
+                        self.param["strip_characters"],
+                        self.param["cond_words_augmentation"],
+                        self.param["words_augmentation_group_sizes"],
+                        self.param["words_augmentation_join_char"],
+                        self.badwords,
+                    )
+                    badwords_ratio = round(badwords_ratio, 3)
+                    st.markdown(f"Flagged words ratio: {badwords_ratio}")
+                    if is_doc_discarded(key, badwords_ratio):
+                        is_discarded = True
+
+                elif key[0] == "lang_id_score":
+                    (
+                        lang_pred_dataset_id,
+                        lang_id_score,
+                    ) = Filtering.compute_lang_id_pred_score(
+                        personal_doc, self.model_lang_id
+                    )
+                    lang_id_score = round(lang_id_score, 3)
+                    st.markdown(
+                        f"Language identification confidence score: {lang_id_score}"
+                    )
+                    if is_doc_discarded(key, badwords_ratio) or (
+                        self.lang_dataset_id != lang_pred_dataset_id
+                    ):
+                        is_discarded = True
+
+                elif key[0] == "perplexity_score":
+                    perplexity_score = Filtering.compute_perplexity_score(
+                        personal_doc,
+                        self.sentencepiece_model,
+                        self.kenlm_model,
+                    )
+                    perplexity_score = round(perplexity_score, 3)
+                    st.markdown(f"Perplexity score: {perplexity_score}")
+                    if is_doc_discarded(key, perplexity_score):
+                        is_discarded = True
+
+            is_discarded = "" if is_discarded else "not "
+            st.markdown(
+                f"With the current filtering parameters, this document **is {is_discarded}discarded**."
+            )
 
     def download_data(self):
         st.header("Download data")
@@ -546,12 +571,13 @@ def download_data(self):
         )
 
     def visualization(self):
-        #self.warning_preamble()
+        # self.warning_preamble()
         self.preamble()
         self.open_data()
         self.set_title()
         self.filtering_of_docs()
         self.filtering_of_words()
+        self.download_parameters()
         self.plot_distributions_filtering_parameters()
         self.plot_zipf_law()
         self.analyse_personal_doc()
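
Because `json.dumps` writes the parameter tuples as JSON arrays, a consumer of the downloaded file gets them back as lists. A hedged sketch of reading such a file (illustrative only; the commit does not include a loader):

    import json

    # e.g. a file produced by the "Download current parameters as json" button
    with open("parameters_en.json") as f:
        parameters = json.load(f)

    for entry in parameters:
        name, cutoff = entry[0], entry[1]  # tuples come back as lists
        print(name, cutoff)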
