66- `remove_numbers(text)`: Remove numbers from the input text.
77- `remove_whitespace(text)`: Remove excess whitespace from the input text.
88- `normalize_whitespace(text)`: Normalize multiple whitespaces into a single whitespace in the input text.
9- - `seperate_symbols (text)`: Separate symbols and words with a space to ease tokenization.
9+ - `separate_symbols (text)`: Separate symbols and words with a space to ease tokenization.
1010- `remove_special_characters(text)`: Remove special characters from the input text.
1111- `standardize_text(text)`: Standardize the formatting of the input text.
1212- `tokenize_text(text)`: Tokenize the input text into individual words.
2727from nltk .tokenize import word_tokenize
2828from nltk .stem import PorterStemmer , WordNetLemmatizer
2929
30- def remove_stopwords (text ) :
30+ def remove_stopwords (text : str ) -> str :
3131 """
3232 Remove stopwords from the input text using NLTK's stopwords.
3333
@@ -51,7 +51,7 @@ def remove_stopwords(text):
5151 print (f"An error occurred during stopwords removal: { str (e )} " )
5252 return text
5353
54- def remove_numbers (text ) :
54+ def remove_numbers (text : str ) -> str :
5555 """
5656 Remove numbers from the input text.
5757
@@ -70,7 +70,7 @@ def remove_numbers(text):
7070 print (f"An error occurred during number removal: { str (e )} " )
7171 return text
7272
73- def remove_whitespace (text ) :
73+ def remove_whitespace (text : str ) -> str :
7474 """
7575 Remove excess whitespace from the input text.
7676
@@ -90,7 +90,7 @@ def remove_whitespace(text):
9090 print (f"An error occurred during whitespace removal: { str (e )} " )
9191 return text
9292
93- def normalize_whitespace (text ) :
93+ def normalize_whitespace (text : str ) -> str :
9494 """
9595 Normalize multiple whitespaces into a single whitespace in the input text.
9696
@@ -110,18 +110,18 @@ def normalize_whitespace(text):
110110 print (f"An error occurred during whitespace normalization: { str (e )} " )
111111 return text
112112
113- def separate_symbols (text ) :
113+ def separate_symbols (text : str ) -> str :
114114 """
115115 Separate symbols and words with a space to ease tokenization.
116116
117117 Symbols in the input text are separated from words with a space to facilitate
118118 easier tokenization and analysis of the text.
119119
120120 Parameters:
121- - `text` (str): The input text from which symbols needs to be seperated .
121+ - `text` (str): The input text from which symbols need to be separated.
122122
123123 Returns:
124- - `str`: The text from which symbols have been seperated .
124+ - `str`: The text from which symbols have been separated .
125125 """
126126 try :
127127 pattern = r"([\W])"
@@ -131,7 +131,7 @@ def separate_symbols(text):
131131 print (f"An error occurred during symbol separation: { str (e )} " )
132132 return text
133133
134- def remove_special_characters (text ) :
134+ def remove_special_characters (text : str ) -> str :
135135 """
136136 Remove special characters from the input text.
137137
@@ -153,7 +153,7 @@ def remove_special_characters(text):
153153 print (f"An error occurred during special character removal: { str (e )} " )
154154 return text
155155
156- def standardize_text (text ) :
156+ def standardize_text (text : str ) -> str :
157157 """
158158 Standardize the formatting of the input text.
159159
@@ -174,7 +174,7 @@ def standardize_text(text):
174174 print (f"An error occurred during text standardization: { str (e )} " )
175175 return text
176176
177- def tokenize_text (text ) :
177+ def tokenize_text (text : str ) -> list [ str ] :
178178 """
179179 Tokenize the input text into individual words.
180180
@@ -186,48 +186,48 @@ def tokenize_text(text):
186186 - `text` (str): The input text to be tokenized.
187187
188188 Returns:
189- - `list`: A list of tokens (words) from the input text.
189+ - `list[str] `: A list of tokens (words) from the input text.
190190 """
191191 nltk .download ('punkt' , quiet = True )
192192 tokens = word_tokenize (text )
193193 return tokens
194194
def stem_words(words: list[str]) -> list[str]:
    """
    Stem the input words using the Porter stemming algorithm.

    Stemming reduces words to their base or root form, helping to consolidate
    variations of words and simplify text analysis.

    Parameters:
    - `words` (list[str]): A list of words to be stemmed.

    Returns:
    - `list[str]`: A list of stemmed words.
    """
    porter = PorterStemmer()
    return [porter.stem(token) for token in words]
211211
def lemmatize_words(words: list[str]) -> list[str]:
    """
    Lemmatize the input words using WordNet lemmatization.

    Lemmatization reduces words to their base or dictionary form, helping to
    normalize variations and simplify text analysis.

    Parameters:
    - `words` (list[str]): A list of words to be lemmatized.

    Returns:
    - `list[str]`: A list of lemmatized words.
    """
    # WordNet data must be present before the lemmatizer can run.
    nltk.download('wordnet', quiet=True)
    wn_lemmatizer = WordNetLemmatizer()
    return [wn_lemmatizer.lemmatize(token) for token in words]
229229
230- def pos_tag (text ) :
230+ def pos_tag (text : str ) -> list [ tuple [ str , str ]] :
231231 """
232232 Perform part-of-speech (POS) tagging on the input text.
233233
@@ -238,7 +238,7 @@ def pos_tag(text):
238238 - `text` (str): The input text to be POS tagged.
239239
240240 Returns:
241- - `list`: A list of tuples containing (word, tag) pairs.
241+ - `list[tuple[str, str]] `: A list of tuples containing (word, tag) pairs.
242242 """
243243 try :
244244 nltk .download ('punkt' , quiet = True )
@@ -250,7 +250,7 @@ def pos_tag(text):
250250 print (f"An error occurred during POS tagging: { str (e )} " )
251251 return []
252252
253- def remove_profanity_from_text (text ) :
253+ def remove_profanity_from_text (text : str ) -> str :
254254 """
255255 Remove profane words from the input text.
256256
@@ -260,7 +260,7 @@ def remove_profanity_from_text(text):
260260 - `text` (str): The input text to remove profanity from.
261261
262262 Returns:
263- - `text` ( str) : The cleaned output text.
263+ - `str` : The cleaned output text.
264264 """
265265 nltk .download ('punkt' , quiet = True )
266266 sentences = nltk .sent_tokenize (text )
@@ -269,7 +269,7 @@ def remove_profanity_from_text(text):
269269
270270 return cleaned_text
271271
272- def remove_sensitive_info_from_text (text ) :
272+ def remove_sensitive_info_from_text (text : str ) -> str :
273273 """
274274 Remove sensitive information from the input text.
275275
@@ -279,7 +279,7 @@ def remove_sensitive_info_from_text(text):
279279 - `text` (str): The input text to remove sensitive information from.
280280
281281 Returns:
282- - `text` ( str) : The cleaned output text.
282+ - `str` : The cleaned output text.
283283 """
284284 nltk .download ('punkt' , quiet = True )
285285 sentences = nltk .sent_tokenize (text )
@@ -288,7 +288,7 @@ def remove_sensitive_info_from_text(text):
288288
289289 return cleaned_text
290290
291- def remove_hate_speech_from_text (text ) :
291+ def remove_hate_speech_from_text (text : str ) -> str :
292292 """
293293 Remove hate speech or offensive speech from the input text.
294294
@@ -298,7 +298,7 @@ def remove_hate_speech_from_text(text):
298298 - `text` (str): The input text to remove hate speech and offensive speech from.
299299
300300 Returns:
301- - `text` ( str) : The cleaned output text.
301+ - `str` : The cleaned output text.
302302 """
303303 nltk .download ('punkt' , quiet = True )
304304 sentences = nltk .sent_tokenize (text )
@@ -311,7 +311,7 @@ def remove_hate_speech_from_text(text):
311311
312312 return cleaned_text
313313
314- def post_format_text (text ) :
314+ def post_format_text (text : str ) -> str :
315315 """
316316 Post-format the text using regex.
317317
@@ -328,4 +328,4 @@ def post_format_text(text):
328328 text = re .sub (r'\s+' , ' ' , text )
329329 # Ensure proper punctuation spacing
330330 text = re .sub (r'\s([.,!?;:])' , r'\1' , text )
331- return text
331+ return text
0 commit comments