Skip to content

Commit 0e8546d

Browse files
committed
visualization: better preamble
1 parent 0cd6d47 commit 0e8546d

1 file changed

Lines changed: 34 additions & 20 deletions

File tree

ac_dc/visualization/visualization.py

Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,6 @@ def warning_preamble(self):
7373
)
7474

7575
def preamble(self):
76-
st.markdown(
77-
"Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail."
78-
)
79-
8076
def get_binary_file_downloader_html(bin_file, file_label="File"):
8177
with open(bin_file, "rb") as f:
8278
data = f.read()
@@ -85,10 +81,11 @@ def get_binary_file_downloader_html(bin_file, file_label="File"):
8581
return href
8682

8783
st.markdown(
84+
"Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail in this " +
8885
get_binary_file_downloader_html(
8986
self.path_instructions,
90-
"Download the explanation of the filtering pipeline as pdf",
91-
),
87+
"pdf",
88+
) + ".",
9289
unsafe_allow_html=True,
9390
)
9491

@@ -123,15 +120,20 @@ def set_title(self):
123120

124121
@staticmethod
125122
def plot_hist(dataframe, key, num_bins=50):
126-
checkbox = st.checkbox("Diplay distribution", value=True, key=f"display_distribution_{key[0]}")
123+
checkbox = st.checkbox(
124+
"Diplay distribution", value=True, key=f"display_distribution_{key[0]}"
125+
)
127126
if checkbox:
128127
fig, ax = plt.subplots()
129128
val = dataframe[key[0]].values
130129
if np.median(val) != 0:
131-
val = val[abs(val - np.median(val)) < 9 * np.median(np.absolute(val - np.median(val)))]
130+
val = val[
131+
abs(val - np.median(val))
132+
< 9 * np.median(np.absolute(val - np.median(val)))
133+
]
132134
ax.hist(val, bins=num_bins, density=True)
133135
ax.set_title(" ".join(key[0].split("_")))
134-
ax.axvline(x=key[1], color='r', linestyle='dashed')
136+
ax.axvline(x=key[1], color="r", linestyle="dashed")
135137
st.pyplot(fig)
136138

137139
def filtering_of_docs(self):
@@ -281,9 +283,7 @@ def print_discared_by_cond(cond):
281283
with st.sidebar.expander("Perplexity score"):
282284
cutoff_def = "If the perplexity score of a document is higher than this number, the document is removed."
283285
max_pp = int(np.max(self.docs["perplexity_score"])) + 1
284-
cutoff_perplexity_score = st.slider(
285-
cutoff_def, 0, max_pp, max_pp
286-
)
286+
cutoff_perplexity_score = st.slider(cutoff_def, 0, max_pp, max_pp)
287287
new_key = ("perplexity_score", cutoff_perplexity_score, True)
288288
keys.append(new_key)
289289
Visualization.plot_hist(self.docs, new_key)
@@ -299,8 +299,12 @@ def print_discared_by_cond(cond):
299299
all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
300300
all_conds = np.all(all_conds, axis=0)
301301

302-
with st.expander(f"Filtering on documents, for {self.num_docs} {self.lang} documents"):
303-
st.header(f"Filtering on documents, for {self.num_docs} {self.lang} documents")
302+
with st.expander(
303+
f"Filtering on documents, for {self.num_docs} {self.lang} documents"
304+
):
305+
st.header(
306+
f"Filtering on documents, for {self.num_docs} {self.lang} documents"
307+
)
304308

305309
def display_dataset(cond, description):
306310
displayed_docs = self.docs.loc[cond]
@@ -353,7 +357,9 @@ def display_dataset(cond, description):
353357
)
354358

355359
if "flagged_words_ratio" in columns:
356-
cond_filter = np.invert(np.all(conds["flagged_words_ratio"], axis=0))
360+
cond_filter = np.invert(
361+
np.all(conds["flagged_words_ratio"], axis=0)
362+
)
357363
display_dataset(
358364
cond_filter,
359365
"Discarded documents for the filter on the flagged words ratio",
@@ -404,10 +410,16 @@ def filtering_of_words(self):
404410

405411
cond_words = self.words["len_word"] <= cutoff_word
406412
if incorrect_substrings:
407-
cond_words = cond_words & np.invert(self.words["incorrect_substring"])
413+
cond_words = cond_words & np.invert(
414+
self.words["incorrect_substring"]
415+
)
408416

409-
with st.expander(f"Filtering on words, for {self.num_docs} {self.lang} documents"):
410-
st.header(f"Filtering on words, for {self.num_docs} {self.lang} documents")
417+
with st.expander(
418+
f"Filtering on words, for {self.num_docs} {self.lang} documents"
419+
):
420+
st.header(
421+
f"Filtering on words, for {self.num_docs} {self.lang} documents"
422+
)
411423

412424
st.markdown(
413425
f"Since the number of words is way larger than the number of documents, "
@@ -514,7 +526,9 @@ def is_doc_discarded(key, score):
514526
)
515527
)
516528
special_characters_ratio = round(special_characters_ratio, 3)
517-
st.markdown(f"Special characters ratio: {special_characters_ratio}")
529+
st.markdown(
530+
f"Special characters ratio: {special_characters_ratio}"
531+
)
518532
if is_doc_discarded(key, special_characters_ratio):
519533
is_discarded = True
520534

@@ -581,7 +595,7 @@ def is_doc_discarded(key, score):
581595
)
582596

583597
def visualization(self):
584-
# self.warning_preamble()
598+
self.warning_preamble()
585599
self.preamble()
586600
self.open_data()
587601
self.set_title()

0 commit comments

Comments
 (0)