import base64
import json
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Silence pandas' SettingWithCopyWarning: the visualization deliberately
# mutates slices of the filtered DataFrames in place.
pd.options.mode.chained_assignment = None

# Fix: the original called os.path.join without importing os (NameError on
# import). Make the parent ac_dc directory importable so that filtering.py,
# which lives there, can be found regardless of the working directory.
sys.path.insert(1, os.path.join(sys.path[0], ".."))
sys.path.append(str(Path(sys.path[0]).parent.absolute().parent.absolute()))

from filtering import LoadParameters, ModifyingDocuments, Filtering
1728class Visualization :
1829 def __init__ (
@@ -23,6 +34,10 @@ def __init__(
2334 num_docs ,
2435 num_docs_for_words ,
2536 max_len_text_display ,
37+ lang_dataset_id ,
38+ path_fasttext_model ,
39+ path_sentencepiece_model ,
40+ path_kenlm_model ,
2641 ):
2742 self .path_instructions = path_instructions
2843 self .path_data = path_data
@@ -31,6 +46,32 @@ def __init__(
3146 self .num_docs_for_words = num_docs_for_words
3247 self .max_len_text_display = max_len_text_display
3348
49+ self .lang_dataset_id = lang_dataset_id
50+ self .param = LoadParameters .load_parameters (lang_dataset_id )
51+ self .stopwords = LoadParameters .load_stopwords (lang_dataset_id )
52+ self .badwords = LoadParameters .load_badwords (lang_dataset_id )
53+ self .model_lang_id = LoadParameters .load_model_lang_id (
54+ lang_dataset_id , path_fasttext_model
55+ )
56+ self .sentencepiece_model = LoadParameters .load_sentencepiece_model (
57+ lang_dataset_id , path_sentencepiece_model
58+ )
59+ self .sentencepiece_model_tok = (
60+ self .sentencepiece_model if self .param ["tokenization" ] else None
61+ )
62+ self .kenlm_model = LoadParameters .load_kenlm_model (
63+ lang_dataset_id , path_kenlm_model
64+ )
65+
66+ def warning_preamble (self ):
67+ st .markdown (
68+ "This demo can be a little slow, and only allows you to process up to 5000 documents "
69+ "for a decent speed. If you want to display up to three times more documents and have "
70+ "a faster visualization, we invite you to run this "
71+ "[code](https://github.com/bigscience-workshop/data_tooling/tree/master/ac_dc/visualization) "
72+ "on your computer."
73+ )
74+
3475 def preamble (self ):
3576 st .markdown (
3677 "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail."
@@ -131,21 +172,25 @@ def print_discared_by_cond(cond):
131172 else 0
132173 )
133174 label_selectbox = (
134- "Length of the repetitions (that will determine the repetitions ratio). "
135- "Choosing a higher or lower number does not mean that the filtering "
136- "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
137- "tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
138- "few or no repetitions, simply because their length gives them more diversity, and we do "
139- "not want to discard such documents."
175+ "Length of the repetitions (that will determine the repetitions ratio)."
140176 )
141177 repetitions_length = st .sidebar .selectbox (
142178 label = label_selectbox ,
143179 options = val_repetitions_lengths ,
144180 index = default_index ,
145181 )
182+ st .sidebar .caption (
183+ "Choosing a higher or lower number does not mean that the filtering "
184+ "is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
185+ "tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
186+ "few or no repetitions, simply because their length gives them more diversity, and we do "
187+ "not want to discard such documents."
188+ )
146189 self .docs = self .docs_checkpoint
147190 for i in range (len (self .docs ["repetitions_ratio" ])):
148- self .docs ["repetitions_ratio" ].iloc [i ] = self .docs ["repetitions_ratio" ].iloc [i ][repetitions_length ]
191+ self .docs ["repetitions_ratio" ].iloc [i ] = self .docs [
192+ "repetitions_ratio"
193+ ].iloc [i ][repetitions_length ]
149194
150195 cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
151196 cutoff_repetitions_ratio = st .sidebar .slider (
@@ -155,6 +200,7 @@ def print_discared_by_cond(cond):
155200 "repetitions_ratio" ,
156201 cutoff_repetitions_ratio ,
157202 True ,
203+ repetitions_length ,
158204 )
159205 keys .append (new_key )
160206 cond = get_cond (new_key [0 ], new_key [1 ], new_key [2 ])
@@ -388,6 +434,107 @@ def plot_zipf_law(self):
388434 ax .set_ylabel ("frequency in the documents" )
389435 st .pyplot (fig )
390436
437+ def analyse_personal_doc (self ):
438+ st .header ("Analyse your own document" )
439+
440+ personal_doc = st .text_area (
441+ label = "Paste here the document you want to analyse" ,
442+ value = "" ,
443+ max_chars = 10000 ,
444+ )
445+
446+ is_discarded = False
447+
448+ def is_doc_discarded (key , score ):
449+ if key [2 ]: # max cutoff
450+ return score > key [1 ]
451+ else :
452+ return score < key [1 ]
453+
454+ st .markdown ("Statistics of the document:" )
455+
456+ for key in self .keys :
457+ if key [0 ] == "number_words" :
458+ words = ModifyingDocuments .get_words_from_document (
459+ personal_doc ,
460+ self .sentencepiece_model_tok ,
461+ lower_case = False ,
462+ strip_characters = self .param ["strip_characters" ],
463+ )
464+ if key [2 ]:
465+ st .markdown (f"Number of words: { len (words )} " )
466+ if is_doc_discarded (key , len (words )):
467+ is_discarded = True
468+
469+ elif key [0 ] == "repetitions_ratio" :
470+ repetitions_ratio = Filtering .compute_repetitions_ratio (personal_doc , int (key [3 ]))
471+ repetitions_ratio = round (repetitions_ratio , 3 )
472+ st .markdown (f"Repetitions ratio: { repetitions_ratio } " )
473+ if is_doc_discarded (key , repetitions_ratio ):
474+ is_discarded = True
475+
476+ elif key [0 ] == "special_characters_ratio" :
477+ special_characters_ratio = Filtering .compute_special_characters_ratio (
478+ personal_doc , self .param ["special_characters" ]
479+ )
480+ special_characters_ratio = round (special_characters_ratio , 3 )
481+ st .markdown (f"Special characters ratio: { special_characters_ratio } " )
482+ if is_doc_discarded (key , special_characters_ratio ):
483+ is_discarded = True
484+
485+ elif key [0 ] == "stopwords_ratio" :
486+ stopwords_ratio = Filtering .compute_stopwords_ratio (
487+ personal_doc ,
488+ self .sentencepiece_model_tok ,
489+ self .param ["strip_characters" ],
490+ self .param ["cond_words_augmentation" ],
491+ self .param ["words_augmentation_group_sizes" ],
492+ self .param ["words_augmentation_join_char" ],
493+ self .stopwords ,
494+ )
495+ stopwords_ratio = round (stopwords_ratio , 3 )
496+ st .markdown (f"Stop words ratio: { stopwords_ratio } " )
497+ if is_doc_discarded (key , stopwords_ratio ):
498+ is_discarded = True
499+
500+ elif key [0 ] == "badwords_ratio" :
501+ badwords_ratio = Filtering .compute_badwords_ratio (
502+ personal_doc ,
503+ self .sentencepiece_model_tok ,
504+ self .param ["strip_characters" ],
505+ self .param ["cond_words_augmentation" ],
506+ self .param ["words_augmentation_group_sizes" ],
507+ self .param ["words_augmentation_join_char" ],
508+ self .badwords ,
509+ )
510+ badwords_ratio = round (badwords_ratio , 3 )
511+ st .markdown (f"Flagged words ratio: { badwords_ratio } " )
512+ if is_doc_discarded (key , badwords_ratio ):
513+ is_discarded = True
514+
515+ elif key [0 ] == "lang_id_score" :
516+ lang_pred_dataset_id , lang_id_score = Filtering .compute_lang_id_pred_score (
517+ personal_doc , self .model_lang_id
518+ )
519+ lang_id_score = round (lang_id_score , 3 )
520+ st .markdown (f"Language identification confidence score: { lang_id_score } " )
521+ if is_doc_discarded (key , badwords_ratio ) or (self .lang_dataset_id != lang_pred_dataset_id ):
522+ is_discarded = True
523+
524+ elif key [0 ] == "perplexity_score" :
525+ perplexity_score = Filtering .compute_perplexity_score (
526+ personal_doc ,
527+ self .sentencepiece_model ,
528+ self .kenlm_model ,
529+ )
530+ perplexity_score = round (perplexity_score , 3 )
531+ st .markdown (f"Perplexity score: { perplexity_score } " )
532+ if is_doc_discarded (key , perplexity_score ):
533+ is_discarded = True
534+
535+ is_discarded = "" if is_discarded else "not "
536+ st .markdown (f"With the current filtering parameters, this document **is { is_discarded } discarded**." )
537+
391538 def download_data (self ):
392539 st .header ("Download data" )
393540
@@ -399,13 +546,15 @@ def download_data(self):
399546 )
400547
401548 def visualization (self ):
549+ #self.warning_preamble()
402550 self .preamble ()
403551 self .open_data ()
404552 self .set_title ()
405553 self .filtering_of_docs ()
406554 self .filtering_of_words ()
407555 self .plot_distributions_filtering_parameters ()
408556 self .plot_zipf_law ()
557+ self .analyse_personal_doc ()
409558 self .download_data ()
410559
411560
@@ -416,12 +565,22 @@ def visualization(self):
num_docs_for_words = 1500
max_len_text_display = 10000

# The four settings below are only used by analyse_personal_doc.
lang_dataset_id = "en"
path_fasttext_model = "./ac_dc/lid.176.bin"
path_sentencepiece_model = "./ac_dc/en.sp.model"
path_kenlm_model = "./ac_dc/en.arpa.bin"

# Build the app object and render the whole page.
visualization = Visualization(
    path_instructions,
    path_data,
    lang,
    num_docs,
    num_docs_for_words,
    max_len_text_display,
    lang_dataset_id,
    path_fasttext_model,
    path_sentencepiece_model,
    path_kenlm_model,
)
visualization.visualization()