Skip to content

Commit 18ced62

Browse files
authored
Merge pull request #2 from Infinitode/update-duplipy-0.2.6-csv-numerical-augmentation-2544036806862479686
Update DupliPy to 0.2.6: CSV and Numerical Augmentation
2 parents 6ab152f + 59ed8d4 commit 18ced62

6 files changed

Lines changed: 378 additions & 93 deletions

File tree

duplipy/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import duplipy
22
from .formatting import remove_stopwords, remove_numbers, remove_whitespace, normalize_whitespace, separate_symbols, remove_special_characters, standardize_text, tokenize_text, stem_words, lemmatize_words, pos_tag, remove_profanity_from_text, remove_sensitive_info_from_text, remove_hate_speech_from_text, post_format_text
3-
from .replication import replace_word_with_synonym, augment_text_with_synonyms, load_text_file, augment_file_with_synonyms, insert_random_word, delete_random_word, random_word_deletion, swap_random_words, insert_synonym, paraphrase, flip_horizontal, flip_vertical, rotate, random_rotation, resize, crop, random_crop, shuffle_words, random_flip, random_color_jitter, noise_overlay
3+
from .replication import replace_word_with_synonym, augment_text_with_synonyms, load_text_file, augment_file_with_synonyms, insert_random_word, delete_random_word, random_word_deletion, swap_random_words, insert_synonym, paraphrase, flip_horizontal, flip_vertical, rotate, random_rotation, resize, crop, random_crop, shuffle_words, random_flip, random_color_jitter, noise_overlay, add_noise, scale_data, shift_data, augment_time_series, balance_dataset, augment_csv_data
44
from .similarity import edit_distance_score, bleu_score, jaccard_similarity_score, sorensen_dice_coefficient, cosine_similarity_score, mean_squared_error, psnr
5-
from .text_analysis import analyze_sentiment, named_entity_recognition
5+
from .text_analysis import analyze_sentiment, named_entity_recognition

duplipy/formatting.py

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
- `remove_numbers(text)`: Remove numbers from the input text.
77
- `remove_whitespace(text)`: Remove excess whitespace from the input text.
88
- `normalize_whitespace(text)`: Normalize multiple whitespaces into a single whitespace in the input text.
9-
- `seperate_symbols(text)`: Separate symbols and words with a space to ease tokenization.
9+
- `separate_symbols(text)`: Separate symbols and words with a space to ease tokenization.
1010
- `remove_special_characters(text)`: Remove special characters from the input text.
1111
- `standardize_text(text)`: Standardize the formatting of the input text.
1212
- `tokenize_text(text)`: Tokenize the input text into individual words.
@@ -27,7 +27,7 @@
2727
from nltk.tokenize import word_tokenize
2828
from nltk.stem import PorterStemmer, WordNetLemmatizer
2929

30-
def remove_stopwords(text):
30+
def remove_stopwords(text: str) -> str:
3131
"""
3232
Remove stopwords from the input text using NLTK's stopwords.
3333
@@ -51,7 +51,7 @@ def remove_stopwords(text):
5151
print(f"An error occurred during stopwords removal: {str(e)}")
5252
return text
5353

54-
def remove_numbers(text):
54+
def remove_numbers(text: str) -> str:
5555
"""
5656
Remove numbers from the input text.
5757
@@ -70,7 +70,7 @@ def remove_numbers(text):
7070
print(f"An error occurred during number removal: {str(e)}")
7171
return text
7272

73-
def remove_whitespace(text):
73+
def remove_whitespace(text: str) -> str:
7474
"""
7575
Remove excess whitespace from the input text.
7676
@@ -90,7 +90,7 @@ def remove_whitespace(text):
9090
print(f"An error occurred during whitespace removal: {str(e)}")
9191
return text
9292

93-
def normalize_whitespace(text):
93+
def normalize_whitespace(text: str) -> str:
9494
"""
9595
Normalize multiple whitespaces into a single whitespace in the input text.
9696
@@ -110,18 +110,18 @@ def normalize_whitespace(text):
110110
print(f"An error occurred during whitespace normalization: {str(e)}")
111111
return text
112112

113-
def separate_symbols(text):
113+
def separate_symbols(text: str) -> str:
114114
"""
115115
Separate symbols and words with a space to ease tokenization.
116116
117117
Symbols in the input text are separated from words with a space to facilitate
118118
easier tokenization and analysis of the text.
119119
120120
Parameters:
121-
- `text` (str): The input text from which symbols needs to be seperated.
121+
- `text` (str): The input text from which symbols need to be separated.
122122
123123
Returns:
124-
- `str`: The text from which symbols have been seperated.
124+
- `str`: The text from which symbols have been separated.
125125
"""
126126
try:
127127
pattern = r"([\W])"
@@ -131,7 +131,7 @@ def separate_symbols(text):
131131
print(f"An error occurred during symbol separation: {str(e)}")
132132
return text
133133

134-
def remove_special_characters(text):
134+
def remove_special_characters(text: str) -> str:
135135
"""
136136
Remove special characters from the input text.
137137
@@ -153,7 +153,7 @@ def remove_special_characters(text):
153153
print(f"An error occurred during special character removal: {str(e)}")
154154
return text
155155

156-
def standardize_text(text):
156+
def standardize_text(text: str) -> str:
157157
"""
158158
Standardize the formatting of the input text.
159159
@@ -174,7 +174,7 @@ def standardize_text(text):
174174
print(f"An error occurred during text standardization: {str(e)}")
175175
return text
176176

177-
def tokenize_text(text):
177+
def tokenize_text(text: str) -> list[str]:
178178
"""
179179
Tokenize the input text into individual words.
180180
@@ -186,48 +186,48 @@ def tokenize_text(text):
186186
- `text` (str): The input text to be tokenized.
187187
188188
Returns:
189-
- `list`: A list of tokens (words) from the input text.
189+
- `list[str]`: A list of tokens (words) from the input text.
190190
"""
191191
nltk.download('punkt', quiet=True)
192192
tokens = word_tokenize(text)
193193
return tokens
194194

195-
def stem_words(words):
195+
def stem_words(words: list[str]) -> list[str]:
196196
"""
197197
Stem the input words using Porter stemming algorithm.
198198
199199
Stemming reduces words to their base or root form, helping to consolidate
200200
variations of words and simplify text analysis.
201201
202202
Parameters:
203-
- `words` (list): A list of words to be stemmed.
203+
- `words` (list[str]): A list of words to be stemmed.
204204
205205
Returns:
206-
- `list`: A list of stemmed words.
206+
- `list[str]`: A list of stemmed words.
207207
"""
208208
stemmer = PorterStemmer()
209209
stemmed_words = [stemmer.stem(word) for word in words]
210210
return stemmed_words
211211

212-
def lemmatize_words(words):
212+
def lemmatize_words(words: list[str]) -> list[str]:
213213
"""
214214
Lemmatize the input words using WordNet lemmatization.
215215
216216
Lemmatization reduces words to their base or dictionary form, helping to
217217
normalize variations and simplify text analysis.
218218
219219
Parameters:
220-
- `words` (list): A list of words to be lemmatized.
220+
- `words` (list[str]): A list of words to be lemmatized.
221221
222222
Returns:
223-
- `list`: A list of lemmatized words.
223+
- `list[str]`: A list of lemmatized words.
224224
"""
225225
nltk.download('wordnet', quiet=True)
226226
lemmatizer = WordNetLemmatizer()
227227
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
228228
return lemmatized_words
229229

230-
def pos_tag(text):
230+
def pos_tag(text: str) -> list[tuple[str, str]]:
231231
"""
232232
Perform part-of-speech (POS) tagging on the input text.
233233
@@ -238,7 +238,7 @@ def pos_tag(text):
238238
- `text` (str): The input text to be POS tagged.
239239
240240
Returns:
241-
- `list`: A list of tuples containing (word, tag) pairs.
241+
- `list[tuple[str, str]]`: A list of tuples containing (word, tag) pairs.
242242
"""
243243
try:
244244
nltk.download('punkt', quiet=True)
@@ -250,7 +250,7 @@ def pos_tag(text):
250250
print(f"An error occurred during POS tagging: {str(e)}")
251251
return []
252252

253-
def remove_profanity_from_text(text):
253+
def remove_profanity_from_text(text: str) -> str:
254254
"""
255255
Remove profane words from the input text.
256256
@@ -260,7 +260,7 @@ def remove_profanity_from_text(text):
260260
- `text` (str): The input text to remove profanity from.
261261
262262
Returns:
263-
- `text` (str): The cleaned output text.
263+
- `str`: The cleaned output text.
264264
"""
265265
nltk.download('punkt', quiet=True)
266266
sentences = nltk.sent_tokenize(text)
@@ -269,7 +269,7 @@ def remove_profanity_from_text(text):
269269

270270
return cleaned_text
271271

272-
def remove_sensitive_info_from_text(text):
272+
def remove_sensitive_info_from_text(text: str) -> str:
273273
"""
274274
Remove sensitive information from the input text.
275275
@@ -279,7 +279,7 @@ def remove_sensitive_info_from_text(text):
279279
- `text` (str): The input text to remove sensitive information from.
280280
281281
Returns:
282-
- `text` (str): The cleaned output text.
282+
- `str`: The cleaned output text.
283283
"""
284284
nltk.download('punkt', quiet=True)
285285
sentences = nltk.sent_tokenize(text)
@@ -288,7 +288,7 @@ def remove_sensitive_info_from_text(text):
288288

289289
return cleaned_text
290290

291-
def remove_hate_speech_from_text(text):
291+
def remove_hate_speech_from_text(text: str) -> str:
292292
"""
293293
Remove hate speech or offensive speech from the input text.
294294
@@ -298,7 +298,7 @@ def remove_hate_speech_from_text(text):
298298
- `text` (str): The input text to remove hate speech and offensive speech from.
299299
300300
Returns:
301-
- `text` (str): The cleaned output text.
301+
- `str`: The cleaned output text.
302302
"""
303303
nltk.download('punkt', quiet=True)
304304
sentences = nltk.sent_tokenize(text)
@@ -311,7 +311,7 @@ def remove_hate_speech_from_text(text):
311311

312312
return cleaned_text
313313

314-
def post_format_text(text):
314+
def post_format_text(text: str) -> str:
315315
"""
316316
Post-format the text using regex.
317317
@@ -328,4 +328,4 @@ def post_format_text(text):
328328
text = re.sub(r'\s+', ' ', text)
329329
# Ensure proper punctuation spacing
330330
text = re.sub(r'\s([.,!?;:])', r'\1', text)
331-
return text
331+
return text

0 commit comments

Comments
 (0)