Skip to content

Commit 9174c6b

Browse files
committed
visualization: moved distributions of the parameters to the sidebar
1 parent 5fc2126 commit 9174c6b

1 file changed

Lines changed: 23 additions & 25 deletions

File tree

ac_dc/visualization/visualization.py

Lines changed: 23 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -121,6 +121,19 @@ def open_data(self):
121121
def set_title(self):
122122
st.title(f"{self.num_docs} {self.lang} documents with their stats.")
123123

124+
@staticmethod
def plot_hist(dataframe, key, num_bins=50):
    """Optionally draw the distribution of one filtering statistic in the sidebar.

    A sidebar checkbox (one per statistic) controls whether the histogram is
    shown. When it is ticked, the values of the column are plotted with a
    dashed red vertical line marking the currently selected cutoff.

    Args:
        dataframe: Table holding the statistic; indexed as ``dataframe[key[0]]``.
        key: Sequence whose first item is the column name and whose second
            item is the cutoff value to mark on the plot.
        num_bins: Number of histogram bins.
    """
    checkbox = st.sidebar.checkbox("Display distribution", key=f"display_distribution_{key[0]}")
    if checkbox:
        fig, ax = plt.subplots()
        val = dataframe[key[0]].values

        # Trim extreme outliers so the bulk of the distribution stays readable:
        # keep only the values lying within 6 median absolute deviations (MAD)
        # of the median.
        median = np.median(val)
        if median != 0:
            deviation = np.absolute(val - median)
            mad = np.median(deviation)
            # A zero MAD (more than half of the values equal the median) would
            # make the strict comparison reject every value and leave an empty
            # plot, so the trimming is skipped in that case.
            if mad > 0:
                val = val[deviation < 6 * mad]

        ax.hist(val, bins=num_bins)
        ax.set_title(" ".join(key[0].split("_")))
        ax.axvline(x=key[1], color='r', linestyle='dashed')
        st.sidebar.pyplot(fig)
        # pyplot keeps a reference to every figure it creates; close this one
        # so figures do not accumulate each time the script is rerun.
        plt.close(fig)
136+
124137
def filtering_of_docs(self):
125138
st.sidebar.subheader("Parameters of the filtering on documents")
126139

@@ -148,6 +161,7 @@ def print_discared_by_cond(cond):
148161
)
149162
new_key = ("number_words", cutoff_min_number_words, False)
150163
keys.append(new_key)
164+
Visualization.plot_hist(self.docs, new_key)
151165
cond_1 = get_cond(new_key[0], new_key[1], new_key[2])
152166
print_discared_by_cond(cond_1)
153167

@@ -201,6 +215,7 @@ def print_discared_by_cond(cond):
201215
repetitions_length,
202216
)
203217
keys.append(new_key)
218+
Visualization.plot_hist(self.docs, new_key)
204219
cond = get_cond(new_key[0], new_key[1], new_key[2])
205220
print_discared_by_cond(cond)
206221
conds["repetitions_ratio"] = [cond]
@@ -216,6 +231,7 @@ def print_discared_by_cond(cond):
216231
True,
217232
)
218233
keys.append(new_key)
234+
Visualization.plot_hist(self.docs, new_key)
219235
cond = get_cond(new_key[0], new_key[1], new_key[2])
220236
print_discared_by_cond(cond)
221237
conds["special_characters_ratio"] = [cond]
@@ -227,6 +243,7 @@ def print_discared_by_cond(cond):
227243
)
228244
new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
229245
keys.append(new_key)
246+
Visualization.plot_hist(self.docs, new_key)
230247
cond = get_cond(new_key[0], new_key[1], new_key[2])
231248
print_discared_by_cond(cond)
232249
conds["stopwords_ratio"] = [cond]
@@ -238,6 +255,7 @@ def print_discared_by_cond(cond):
238255
)
239256
new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True)
240257
keys.append(new_key)
258+
Visualization.plot_hist(self.docs, new_key)
241259
cond = get_cond(new_key[0], new_key[1], new_key[2])
242260
print_discared_by_cond(cond)
243261
conds["flagged_words_ratio"] = [cond]
@@ -249,6 +267,7 @@ def print_discared_by_cond(cond):
249267
)
250268
new_key = ("lang_id_score", cutoff_lang_id_score, False)
251269
keys.append(new_key)
270+
Visualization.plot_hist(self.docs, new_key)
252271
cond = get_cond(new_key[0], new_key[1], new_key[2])
253272
print_discared_by_cond(cond)
254273
conds["lang_id_score"] = [cond]
@@ -261,6 +280,7 @@ def print_discared_by_cond(cond):
261280
)
262281
new_key = ("perplexity_score", cutoff_perplexity_score, True)
263282
keys.append(new_key)
283+
Visualization.plot_hist(self.docs, new_key)
264284
cond = get_cond(new_key[0], new_key[1], new_key[2])
265285
print_discared_by_cond(cond)
266286
conds["perplexity_score"] = [cond]
@@ -355,7 +375,9 @@ def filtering_of_words(self):
355375
cutoff_def = "If the length of a word is higher than this number, the word is removed."
356376
max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
357377
cutoff_word = st.sidebar.slider(cutoff_def, 0, max_len_word, max_len_word)
358-
self.parameters.append(("len_word", cutoff_word, True))
378+
new_key = ("len_word", cutoff_word, True)
379+
self.parameters.append(new_key)
380+
Visualization.plot_hist(self.words, new_key)
359381
st.sidebar.caption("---------")
360382

361383
incorrect_substrings = st.sidebar.checkbox(
@@ -400,29 +422,6 @@ def download_parameters(self):
400422
file_name=f"parameters_{self.lang_dataset_id}.json",
401423
)
402424

403-
def plot_distributions_filtering_parameters(self):
    """Show, on demand, a bar-chart histogram for every filtering statistic.

    A checkbox in the main page toggles the display. One chart is drawn per
    distinct statistic name found in ``self.keys`` (read from ``self.docs``),
    followed by one for ``len_word`` when ``self.words`` is available.
    """
    st.header("Distributions of the filtering parameters")

    if not st.checkbox("Display distributions"):
        return

    def plot_hist(dataframe, key, num_bins=50):
        # Title, then the binned counts over [0, max], then the bin width.
        st.subheader(" ".join(key.split("_")))
        column = dataframe[key].values
        max_range = np.max(column)
        counts, _ = np.histogram(column, bins=num_bins, range=(0, max_range))
        st.bar_chart(counts)
        st.markdown(f"Each bin is of size: {max_range/num_bins}.")

    # dict.fromkeys de-duplicates the statistic names while keeping their order.
    for name in dict.fromkeys(el[0] for el in self.keys):
        plot_hist(self.docs, name)

    if self.words is not None:
        plot_hist(self.words, "len_word")
425-
426425
def plot_zipf_law(self):
427426
if not (self.words is None):
428427
st.header("Zipf's Law")
@@ -578,7 +577,6 @@ def visualization(self):
578577
self.filtering_of_docs()
579578
self.filtering_of_words()
580579
self.download_parameters()
581-
self.plot_distributions_filtering_parameters()
582580
self.plot_zipf_law()
583581
self.analyse_personal_doc()
584582
self.download_data()

0 commit comments

Comments
 (0)