Skip to content

Commit 3a02aea

Browse files
committed
New tool to analyse your own document in the Streamlit visualization
1 parent cb5a0b8 commit 3a02aea

3 files changed

Lines changed: 168 additions & 9 deletions

File tree

31 Bytes
Binary file not shown.

ac_dc/visualization/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,6 @@ Use this visualization tool online at https://huggingface.co/spaces/huggingface/
55
However, by running the code on your computer, it is faster, it can handle in practice up to three times more documents, and it works for every language.
66

77
1) Use get_data_for_visualization.py to get the json gathering examples with their computed statistics for the language you chose.
8-
It uses the streaming mode of the Datasets library, so no need to download the dataset, but you have to download the fasttext model (for the language identification) and the kenlm / sentencepiece models (for the perplexity).
8+
It uses the streaming mode of the Datasets library, so no need to download the dataset, but you have to download the fasttext model (for the language identification) and the sentencepiece / kenlm models (for the tokenization and the perplexity).
99

10-
2) Specify the path to this json in visualization.py and run the command "streamlit run visualization.py".
10+
2) Specify the path to this json and the fasttext / sentencepiece / kenlm models in visualization.py and run the command "streamlit run ac_dc/visualization/visualization.py".

ac_dc/visualization/visualization.py

Lines changed: 166 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,23 @@
77
import base64
88
import json
99
import pandas as pd
10+
1011
pd.options.mode.chained_assignment = None
1112

1213
import numpy as np
1314

1415
import matplotlib.pyplot as plt
1516

17+
import sys
18+
from pathlib import Path
19+
20+
sys.path.insert(1, os.path.join(sys.path[0], ".."))
21+
# Append the path of the ac_dc directory to the python path
22+
# to find the file filtering.py in the parent directory
23+
sys.path.append(str(Path(sys.path[0]).parent.absolute().parent.absolute()))
24+
25+
from filtering import LoadParameters, ModifyingDocuments, Filtering
26+
1627

1728
class Visualization:
1829
def __init__(
@@ -23,6 +34,10 @@ def __init__(
2334
num_docs,
2435
num_docs_for_words,
2536
max_len_text_display,
37+
lang_dataset_id,
38+
path_fasttext_model,
39+
path_sentencepiece_model,
40+
path_kenlm_model,
2641
):
2742
self.path_instructions = path_instructions
2843
self.path_data = path_data
@@ -31,6 +46,32 @@ def __init__(
3146
self.num_docs_for_words = num_docs_for_words
3247
self.max_len_text_display = max_len_text_display
3348

49+
self.lang_dataset_id = lang_dataset_id
50+
self.param = LoadParameters.load_parameters(lang_dataset_id)
51+
self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
52+
self.badwords = LoadParameters.load_badwords(lang_dataset_id)
53+
self.model_lang_id = LoadParameters.load_model_lang_id(
54+
lang_dataset_id, path_fasttext_model
55+
)
56+
self.sentencepiece_model = LoadParameters.load_sentencepiece_model(
57+
lang_dataset_id, path_sentencepiece_model
58+
)
59+
self.sentencepiece_model_tok = (
60+
self.sentencepiece_model if self.param["tokenization"] else None
61+
)
62+
self.kenlm_model = LoadParameters.load_kenlm_model(
63+
lang_dataset_id, path_kenlm_model
64+
)
65+
66+
def warning_preamble(self):
    """Show a notice that the hosted demo is slower and more limited
    than a local run, with a link to the repository."""
    notice = (
        "This demo can be a little slow, and only allows you to process up to 5000 documents "
        "for a decent speed. If you want to display up to three times more documents and have "
        "a faster visualization, we invite you to run this "
        "[code](https://github.com/bigscience-workshop/data_tooling/tree/master/ac_dc/visualization) "
        "on your computer."
    )
    st.markdown(notice)
74+
3475
def preamble(self):
3576
st.markdown(
3677
"Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail."
@@ -131,21 +172,25 @@ def print_discared_by_cond(cond):
131172
else 0
132173
)
133174
label_selectbox = (
134-
"Length of the repetitions (that will determine the repetitions ratio). "
135-
"Choosing a higher or lower number does not mean that the filtering "
136-
"is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
137-
"tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
138-
"few or no repetitions, simply because their length gives them more diversity, and we do "
139-
"not want to discard such documents."
175+
"Length of the repetitions (that will determine the repetitions ratio)."
140176
)
141177
repetitions_length = st.sidebar.selectbox(
142178
label=label_selectbox,
143179
options=val_repetitions_lengths,
144180
index=default_index,
145181
)
182+
st.sidebar.caption(
183+
"Choosing a higher or lower number does not mean that the filtering "
184+
"is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
185+
"tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
186+
"few or no repetitions, simply because their length gives them more diversity, and we do "
187+
"not want to discard such documents."
188+
)
146189
self.docs = self.docs_checkpoint
147190
for i in range(len(self.docs["repetitions_ratio"])):
148-
self.docs["repetitions_ratio"].iloc[i] = self.docs["repetitions_ratio"].iloc[i][repetitions_length]
191+
self.docs["repetitions_ratio"].iloc[i] = self.docs[
192+
"repetitions_ratio"
193+
].iloc[i][repetitions_length]
149194

150195
cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
151196
cutoff_repetitions_ratio = st.sidebar.slider(
@@ -155,6 +200,7 @@ def print_discared_by_cond(cond):
155200
"repetitions_ratio",
156201
cutoff_repetitions_ratio,
157202
True,
203+
repetitions_length,
158204
)
159205
keys.append(new_key)
160206
cond = get_cond(new_key[0], new_key[1], new_key[2])
@@ -388,6 +434,107 @@ def plot_zipf_law(self):
388434
ax.set_ylabel("frequency in the documents")
389435
st.pyplot(fig)
390436

437+
def analyse_personal_doc(self):
    """Let the user paste a document and see its filtering statistics.

    For every filter currently active in ``self.keys``, computes the
    corresponding statistic on the pasted text, displays it, and reports
    whether the current filtering parameters would discard the document.

    Relies on instance state set up in ``__init__``: ``self.param``,
    ``self.stopwords``, ``self.badwords``, ``self.model_lang_id``,
    ``self.sentencepiece_model``, ``self.sentencepiece_model_tok``,
    ``self.kenlm_model`` and ``self.lang_dataset_id``.
    """
    st.header("Analyse your own document")

    personal_doc = st.text_area(
        label="Paste here the document you want to analyse",
        value="",
        max_chars=10000,
    )

    is_discarded = False

    def is_doc_discarded(key, score):
        # key layout (from filtering_of_docs): key[0] = statistic name,
        # key[1] = cutoff value, key[2] = True for a max cutoff
        # (discard above), False for a min cutoff (discard below).
        if key[2]:  # max cutoff
            return score > key[1]
        else:
            return score < key[1]

    st.markdown("Statistics of the document:")

    for key in self.keys:
        if key[0] == "number_words":
            words = ModifyingDocuments.get_words_from_document(
                personal_doc,
                self.sentencepiece_model_tok,
                lower_case=False,
                strip_characters=self.param["strip_characters"],
            )
            if key[2]:
                st.markdown(f"Number of words: {len(words)}")
            if is_doc_discarded(key, len(words)):
                is_discarded = True

        elif key[0] == "repetitions_ratio":
            # key[3] holds the repetitions length selected in the sidebar.
            repetitions_ratio = Filtering.compute_repetitions_ratio(
                personal_doc, int(key[3])
            )
            repetitions_ratio = round(repetitions_ratio, 3)
            st.markdown(f"Repetitions ratio: {repetitions_ratio}")
            if is_doc_discarded(key, repetitions_ratio):
                is_discarded = True

        elif key[0] == "special_characters_ratio":
            special_characters_ratio = Filtering.compute_special_characters_ratio(
                personal_doc, self.param["special_characters"]
            )
            special_characters_ratio = round(special_characters_ratio, 3)
            st.markdown(f"Special characters ratio: {special_characters_ratio}")
            if is_doc_discarded(key, special_characters_ratio):
                is_discarded = True

        elif key[0] == "stopwords_ratio":
            stopwords_ratio = Filtering.compute_stopwords_ratio(
                personal_doc,
                self.sentencepiece_model_tok,
                self.param["strip_characters"],
                self.param["cond_words_augmentation"],
                self.param["words_augmentation_group_sizes"],
                self.param["words_augmentation_join_char"],
                self.stopwords,
            )
            stopwords_ratio = round(stopwords_ratio, 3)
            st.markdown(f"Stop words ratio: {stopwords_ratio}")
            if is_doc_discarded(key, stopwords_ratio):
                is_discarded = True

        elif key[0] == "badwords_ratio":
            badwords_ratio = Filtering.compute_badwords_ratio(
                personal_doc,
                self.sentencepiece_model_tok,
                self.param["strip_characters"],
                self.param["cond_words_augmentation"],
                self.param["words_augmentation_group_sizes"],
                self.param["words_augmentation_join_char"],
                self.badwords,
            )
            badwords_ratio = round(badwords_ratio, 3)
            st.markdown(f"Flagged words ratio: {badwords_ratio}")
            if is_doc_discarded(key, badwords_ratio):
                is_discarded = True

        elif key[0] == "lang_id_score":
            lang_pred_dataset_id, lang_id_score = Filtering.compute_lang_id_pred_score(
                personal_doc, self.model_lang_id
            )
            lang_id_score = round(lang_id_score, 3)
            st.markdown(
                f"Language identification confidence score: {lang_id_score}"
            )
            # BUG FIX: the original tested badwords_ratio here instead of
            # lang_id_score (and badwords_ratio may even be unbound if that
            # branch never ran), so the language-confidence cutoff was
            # never actually applied.
            if is_doc_discarded(key, lang_id_score) or (
                self.lang_dataset_id != lang_pred_dataset_id
            ):
                is_discarded = True

        elif key[0] == "perplexity_score":
            perplexity_score = Filtering.compute_perplexity_score(
                personal_doc,
                self.sentencepiece_model,
                self.kenlm_model,
            )
            perplexity_score = round(perplexity_score, 3)
            st.markdown(f"Perplexity score: {perplexity_score}")
            if is_doc_discarded(key, perplexity_score):
                is_discarded = True

    is_discarded = "" if is_discarded else "not "
    st.markdown(
        f"With the current filtering parameters, this document "
        f"**is {is_discarded}discarded**."
    )
537+
391538
def download_data(self):
392539
st.header("Download data")
393540

@@ -399,13 +546,15 @@ def download_data(self):
399546
)
400547

401548
def visualization(self):
    """Run every section of the Streamlit app, in display order."""
    # self.warning_preamble()  # kept disabled, as in the original
    sections = (
        self.preamble,
        self.open_data,
        self.set_title,
        self.filtering_of_docs,
        self.filtering_of_words,
        self.plot_distributions_filtering_parameters,
        self.plot_zipf_law,
        self.analyse_personal_doc,
        self.download_data,
    )
    for render_section in sections:
        render_section()
410559

411560

@@ -416,12 +565,22 @@ def visualization(self):
416565
num_docs_for_words = 1500
417566
max_len_text_display = 10000
418567

568+
# Only useful for analyse_personal_doc
569+
lang_dataset_id = "en"
570+
path_fasttext_model = "./ac_dc/lid.176.bin"
571+
path_sentencepiece_model = "./ac_dc/en.sp.model"
572+
path_kenlm_model = "./ac_dc/en.arpa.bin"
573+
419574
visualization = Visualization(
420575
path_instructions,
421576
path_data,
422577
lang,
423578
num_docs,
424579
num_docs_for_words,
425580
max_len_text_display,
581+
lang_dataset_id,
582+
path_fasttext_model,
583+
path_sentencepiece_model,
584+
path_kenlm_model,
426585
)
427586
visualization.visualization()

0 commit comments

Comments
 (0)