Skip to content

Commit 9aa646f

Browse files
committed
visualization: upload our own stop words and flagged words list
1 parent 0e8546d commit 9aa646f

1 file changed

Lines changed: 32 additions & 1 deletion

File tree

ac_dc/visualization/visualization.py

Lines changed: 32 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
import os
66

7+
from io import StringIO
78
import base64
89
import json
910
import pandas as pd
@@ -201,7 +202,7 @@ def print_discared_by_cond(cond):
201202
"few or no repetitions, simply because their length gives them more diversity, and we do "
202203
"not want to discard such documents."
203204
)
204-
self.docs = self.docs_checkpoint
205+
self.docs["repetitions_ratio"] = self.docs_checkpoint["repetitions_ratio"]
205206
for i in range(len(self.docs["repetitions_ratio"])):
206207
self.docs["repetitions_ratio"].iloc[i] = self.docs[
207208
"repetitions_ratio"
@@ -242,6 +243,21 @@ def print_discared_by_cond(cond):
242243

243244
if "stopwords_ratio" in columns:
244245
with st.sidebar.expander("Stop words ratio"):
246+
stopwords_file = st.file_uploader("Upload your own list of stop words (one per line). If there is none, the default one is used.")
247+
if stopwords_file:
248+
new_stopwords = StringIO(stopwords_file.getvalue().decode("utf-8")).read()
249+
new_stopwords = set(new_stopwords.split("\n"))
250+
self.docs["stopwords_ratio"] = self.docs_checkpoint["stopwords_ratio"]
251+
for i in range(len(self.docs["stopwords_ratio"])):
252+
self.docs["stopwords_ratio"].iloc[i] = Filtering.compute_stopwords_ratio(
253+
self.docs["text"].iloc[i],
254+
self.sentencepiece_model_tok,
255+
self.param["strip_characters"],
256+
self.param["cond_words_augmentation"],
257+
self.param["words_augmentation_group_sizes"],
258+
self.param["words_augmentation_join_char"],
259+
new_stopwords,
260+
)
245261
cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
246262
cutoff_stopwords_ratio = st.slider(
247263
cutoff_def, 0.0, 1.0, 0.0, step=0.01
@@ -255,6 +271,21 @@ def print_discared_by_cond(cond):
255271

256272
if "flagged_words_ratio" in columns:
257273
with st.sidebar.expander("Flagged words ratio"):
274+
flagged_words_file = st.file_uploader("Upload your own list of flagged words (one per line). If there is none, the default one is used.")
275+
if flagged_words_file:
276+
new_flagged_words = StringIO(flagged_words_file.getvalue().decode("utf-8")).read()
277+
new_flagged_words = set(new_flagged_words.split("\n"))
278+
self.docs["flagged_words_ratio"] = self.docs_checkpoint["flagged_words_ratio"]
279+
for i in range(len(self.docs["flagged_words_ratio"])):
280+
self.docs["flagged_words_ratio"].iloc[i] = Filtering.compute_flagged_words_ratio(
281+
self.docs["text"].iloc[i],
282+
self.sentencepiece_model_tok,
283+
self.param["strip_characters"],
284+
self.param["cond_words_augmentation"],
285+
self.param["words_augmentation_group_sizes"],
286+
self.param["words_augmentation_join_char"],
287+
new_flagged_words,
288+
)
258289
cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
259290
cutoff_flagged_words_ratio = st.slider(
260291
cutoff_def, 0.0, 1.0, 1.0, step=0.01

0 commit comments

Comments
 (0)