Skip to content

Commit 7f12407

Browse files
committed
visualization includes new repetition removal filter
1 parent 659054d commit 7f12407

2 files changed

Lines changed: 61 additions & 6 deletions

File tree

ac_dc/visualization/get_data_for_visualization.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,12 @@ def compute_stats(self):
8888
number_words = len(words)
8989
stats_document["number_words"] = number_words
9090

91+
repetitions_ratios = {
92+
n: round(Filtering.compute_repetitions_ratio(document, n), 4)
93+
for n in range(2, 16)
94+
}
95+
stats_document["repetitions_ratio"] = repetitions_ratios
96+
9197
special_characters_ratio = Filtering.compute_special_characters_ratio(
9298
document, self.param["special_characters"]
9399
)

ac_dc/visualization/visualization.py

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import base64
88
import json
99
import pandas as pd
10+
pd.options.mode.chained_assignment = None
1011

1112
import numpy as np
1213

@@ -73,16 +74,17 @@ def open_data(self):
7374
doc["text"][: self.max_len_text_display]
7475
+ " [...] [THIS LONG TEXT HAS BEEN TRUNCATED FOR DISPLAY REASONS]"
7576
)
76-
self.docs = pd.DataFrame(docs)
77+
self.docs_checkpoint = pd.DataFrame(docs)
78+
self.docs = self.docs_checkpoint
7779

7880
def set_title(self):
7981
st.title(f"{self.num_docs} {self.lang} documents with their stats.")
8082

8183
def filtering_of_docs(self):
8284
st.sidebar.subheader("Parameters of the filtering on documents")
8385

84-
def set_sliders(docs):
85-
columns = list(docs)
86+
def set_sliders():
87+
columns = list(self.docs)
8688
keys = []
8789
conds = {}
8890

@@ -99,7 +101,7 @@ def print_discared_by_cond(cond):
99101

100102
if "number_words" in columns:
101103
cutoff_def = "If the number of words of a document is lower than this number, the document is removed."
102-
max_nb_words = int(np.max(docs["number_words"])) + 1
104+
max_nb_words = int(np.max(self.docs["number_words"])) + 1
103105
cutoff_min_number_words = st.sidebar.slider(
104106
cutoff_def, 0, min(max_nb_words, 500), 0
105107
)
@@ -119,6 +121,46 @@ def print_discared_by_cond(cond):
119121

120122
conds["number_words"] = [cond_1, cond_2]
121123

124+
if "repetitions_ratio" in columns:
125+
val_repetitions_lengths = list(
126+
self.docs["repetitions_ratio"].iloc[0].keys()
127+
)
128+
default_index = (
129+
val_repetitions_lengths.index("10")
130+
if "10" in val_repetitions_lengths
131+
else 0
132+
)
133+
label_selectbox = (
134+
"Length of the repetitions (that will determine the repetitions ratio). "
135+
"Choosing a higher or lower number does not mean that the filtering "
136+
"is stronger or weaker. Be careful, choosing a low number (below 5 for languages like English) "
137+
"tends to associate a high repetitions ratio to very long documents (like book chapters), but with "
138+
"few or no repetitions, simply because their length gives them more diversity, and we do "
139+
"not want to discard such documents."
140+
)
141+
repetitions_length = st.sidebar.selectbox(
142+
label=label_selectbox,
143+
options=val_repetitions_lengths,
144+
index=default_index,
145+
)
146+
self.docs = self.docs_checkpoint
147+
for i in range(len(self.docs["repetitions_ratio"])):
148+
self.docs["repetitions_ratio"].iloc[i] = self.docs["repetitions_ratio"].iloc[i][repetitions_length]
149+
150+
cutoff_def = "If the repetitions ratio of a document is higher than this number, the document is removed."
151+
cutoff_repetitions_ratio = st.sidebar.slider(
152+
cutoff_def, 0.0, 1.0, 1.0, step=0.01
153+
)
154+
new_key = (
155+
"repetitions_ratio",
156+
cutoff_repetitions_ratio,
157+
True,
158+
)
159+
keys.append(new_key)
160+
cond = get_cond(new_key[0], new_key[1], new_key[2])
161+
print_discared_by_cond(cond)
162+
conds["repetitions_ratio"] = [cond]
163+
122164
if "special_characters_ratio" in columns:
123165
cutoff_def = "If the special characters ratio of a document is higher than this number, the document is removed."
124166
cutoff_special_characters_ratio = st.sidebar.slider(
@@ -169,7 +211,7 @@ def print_discared_by_cond(cond):
169211

170212
if "perplexity_score" in columns:
171213
cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
172-
max_pp = int(np.max(docs["perplexity_score"])) + 1
214+
max_pp = int(np.max(self.docs["perplexity_score"])) + 1
173215
cutoff_perplexity_score = st.sidebar.slider(
174216
cutoff_def, 0, max_pp, max_pp
175217
)
@@ -181,7 +223,7 @@ def print_discared_by_cond(cond):
181223

182224
return keys, conds
183225

184-
self.keys, conds = set_sliders(self.docs)
226+
self.keys, conds = set_sliders()
185227

186228
all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
187229
all_conds = np.all(all_conds, axis=0)
@@ -215,6 +257,13 @@ def display_dataset(cond, description):
215257
"Discarded documents for the filter on the number of words",
216258
)
217259

260+
if "repetitions_ratio" in columns:
261+
cond_filter = np.invert(np.all(conds["repetitions_ratio"], axis=0))
262+
display_dataset(
263+
cond_filter,
264+
"Discarded documents for the filter on the repetitions ratio",
265+
)
266+
218267
if "special_characters_ratio" in columns:
219268
cond_filter = np.invert(
220269
np.all(conds["special_characters_ratio"], axis=0)

0 commit comments

Comments
 (0)