1+ """
2+ Formatting for text using common NLP techniques.
3+
4+ Available functions:
5+ - `remove_stopwords(text)`: Remove stopwords from the input text using NLTK's stopwords.
6+ - `remove_numbers(text)`: Remove numbers from the input text.
7+ - `remove_whitespace(text)`: Remove excess whitespace from the input text.
8+ - `normalize_whitespace(text)`: Normalize multiple whitespaces into a single whitespace in the input text.
9+ - `seperate_symbols(text)`: Separate symbols and words with a space to ease tokenization.
10+ - `remove_special_characters(text)`: Remove special characters from the input text.
11+ - `standardize_text(text)`: Standardize the formatting of the input text.
12+ - `tokenize_text(text)`: Tokenize the input text into individual words.
13+ - `stem_words(words)`: Stem the input words using Porter stemming algorithm.
14+ - `lemmatize_words(words)`: Lemmatize the input words using WordNet lemmatization.
15+ - `pos_tag(text)`: Perform part-of-speech (POS) tagging on the input text.
16+ """

import string
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK data these functions rely on; quiet=True suppresses
# progress output, and re-running the downloads is harmless.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

def remove_stopwords(text):
    """
    Remove stopwords from the input text using NLTK's stopwords.

    Parameters:
    - `text` (str): The input text from which stopwords should be removed.

    Returns:
    - `str`: The text without stopwords.
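
    Example (illustrative; the exact result depends on NLTK's English
    stopword list):
        >>> remove_stopwords("this is a test")
        'test'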
40+ """
41+ try :
42+ stop_words = set (stopwords .words ('english' ))
43+ tokens = text .split ()
44+ filtered_tokens = [token for token in tokens if token .lower () not in stop_words ]
45+ filtered_text = ' ' .join (filtered_tokens )
46+ return filtered_text
47+ except Exception as e :
48+ print (f"An error occurred during stopwords removal: { str (e )} " )
49+ return text
50+
def remove_numbers(text):
    """
    Remove numbers from the input text.

    Parameters:
    - `text` (str): The input text from which numbers should be removed.

    Returns:
    - `str`: The text without numbers.
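
    Example (illustrative; digits are deleted, surrounding text is kept):
        >>> remove_numbers("abc123def")
        'abcdef'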
60+ """
61+ try :
62+ text = re .sub (r'\d+' , '' , text )
63+ return text
64+ except Exception as e :
65+ print (f"An error occurred during number removal: { str (e )} " )
66+ return text
67+
def remove_whitespace(text):
    """
    Remove excess whitespace from the input text.

    Parameters:
    - `text` (str): The input text from which excess whitespace should be removed.

    Returns:
    - `str`: The text with excess whitespace removed.
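
    Example (illustrative; leading and trailing whitespace is removed,
    interior runs collapse to one space):
        >>> remove_whitespace("  hello   world  ")
        'hello world'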
77+ """
78+ try :
79+ text = ' ' .join (text .split ())
80+ return text
81+ except Exception as e :
82+ print (f"An error occurred during whitespace removal: { str (e )} " )
83+ return text
84+
def normalize_whitespace(text):
    """
    Normalize runs of whitespace into a single space in the input text.

    Parameters:
    - `text` (str): The input text whose whitespace should be normalized.

    Returns:
    - `str`: The text with normalized whitespace.
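
    Example (illustrative; unlike `remove_whitespace`, leading and trailing
    whitespace is collapsed to a single space rather than removed):
        >>> normalize_whitespace("a  b   c")
        'a b c'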
94+ """
95+ try :
96+ text = re .sub (r'\s+' , ' ' , text )
97+ return text
98+ except Exception as e :
99+ print (f"An error occurred during whitespace normalization: { str (e )} " )
100+ return text
101+
def separate_symbols(text):
    """
    Separate symbols and words with a space to ease tokenization.

    Parameters:
    - `text` (str): The input text in which symbols need to be separated.

    Returns:
    - `str`: The text with symbols separated from words.
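
    Example (illustrative; note that the non-word character class also
    matches spaces, so existing spaces get padded too):
        >>> separate_symbols("a+b")
        'a + b'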
111+ """
112+ try :
113+ pattern = r"([\W])"
114+ separated_text = re .sub (pattern , r" \1 " , text )
115+ return separated_text
116+ except Exception as e :
117+ print (f"An error occurred during symbol separation: { str (e )} " )
118+ return text
119+
def remove_special_characters(text):
    """
    Remove special characters from the input text.

    Parameters:
    - `text` (str): The input text from which special characters should be removed.

    Returns:
    - `str`: The text with special characters removed.
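
    Example (illustrative; removes ASCII punctuation):
        >>> remove_special_characters("hello@world!")
        'helloworld'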
129+ """
130+ try :
131+ text = text .translate (str .maketrans ("" , "" , string .punctuation ))
132+ special_characters = "@#$%^&*"
133+ text = '' .join (char for char in text if char not in special_characters )
134+ return text
135+ except Exception as e :
136+ print (f"An error occurred during special character removal: { str (e )} " )
137+ return text
138+
def standardize_text(text):
    """
    Standardize the formatting of the input text by lowercasing it and
    stripping leading/trailing whitespace.

    Parameters:
    - `text` (str): The input text to be standardized.

    Returns:
    - `str`: The standardized text.
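
    Example (illustrative):
        >>> standardize_text("  Hello World  ")
        'hello world'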
148+ """
149+ try :
150+ text = text .lower ()
151+ text = text .strip ()
152+ return text
153+ except Exception as e :
154+ print (f"An error occurred during text standardization: { str (e )} " )
155+ return text
156+
def tokenize_text(text):
    """
    Tokenize the input text into individual words.

    Parameters:
    - `text` (str): The input text to be tokenized.

    Returns:
    - `list`: A list of tokens (words) from the input text.
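
    Example (illustrative; requires the NLTK `punkt` tokenizer data, and
    punctuation comes back as separate tokens):
        >>> tokenize_text("Hello, world!")
        ['Hello', ',', 'world', '!']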
166+ """
167+ tokens = word_tokenize (text )
168+ return tokens
169+
def stem_words(words):
    """
    Stem the input words using the Porter stemming algorithm.

    Parameters:
    - `words` (list): A list of words to be stemmed.

    Returns:
    - `list`: A list of stemmed words.
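
    Example (illustrative; Porter stems are not always dictionary words):
        >>> stem_words(["running", "flies"])
        ['run', 'fli']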
179+ """
180+ stemmer = PorterStemmer ()
181+ stemmed_words = [stemmer .stem (word ) for word in words ]
182+ return stemmed_words
183+
def lemmatize_words(words):
    """
    Lemmatize the input words using WordNet lemmatization.

    Parameters:
    - `words` (list): A list of words to be lemmatized.

    Returns:
    - `list`: A list of lemmatized words.
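
    Example (illustrative; the lemmatizer defaults to treating words as nouns):
        >>> lemmatize_words(["geese", "cars"])
        ['goose', 'car']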
193+ """
194+ lemmatizer = WordNetLemmatizer ()
195+ lemmatized_words = [lemmatizer .lemmatize (word ) for word in words ]
196+ return lemmatized_words
197+
def pos_tag(text):
    """
    Perform part-of-speech (POS) tagging on the input text.

    Parameters:
    - `text` (str): The input text to be POS tagged.

    Returns:
    - `list`: A list of tuples containing (word, tag) pairs.
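
    Example (illustrative; exact tags depend on the tagger model in use):
        >>> pos_tag("The cat sleeps")
        [('The', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')]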
207+ """
208+ try :
209+ tokens = nltk .word_tokenize (text )
210+ tagged_words = nltk .pos_tag (tokens )
211+ return tagged_words
212+ except Exception as e :
213+ print (f"An error occurred during POS tagging: { str (e )} " )
214+ return []