
Commit 7afdeba

Added DupliPy main files.
1 parent dafc3dd commit 7afdeba

5 files changed

Lines changed: 527 additions & 0 deletions


source/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
import duplipy
from .formatting import remove_stopwords, remove_numbers, remove_whitespace, normalize_whitespace, separate_symbols, remove_special_characters, standardize_text, tokenize_text, stem_words, lemmatize_words, pos_tag
from .replication import replace_word_with_synonym, augment_text_with_synonyms, load_text_file, augment_file_with_synonyms, insert_random_word, delete_random_word, insert_synonym, paraphrase
from .similarity import edit_distance_score, bleu_score
from .text_analysis import analyze_sentiment
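
These re-exports make the package's main helpers importable from the top level. A minimal usage sketch (assuming the distribution is installed under the name duplipy, as the self-import above suggests):

    from duplipy import remove_stopwords, tokenize_text

    cleaned = remove_stopwords("This is a simple example")
    tokens = tokenize_text(cleaned)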

source/formatting.py

Lines changed: 214 additions & 0 deletions
@@ -0,0 +1,214 @@
"""
Formatting for text using common NLP techniques.

Available functions:
- `remove_stopwords(text)`: Remove stopwords from the input text using NLTK's stopwords.
- `remove_numbers(text)`: Remove numbers from the input text.
- `remove_whitespace(text)`: Remove excess whitespace from the input text.
- `normalize_whitespace(text)`: Normalize multiple whitespaces into a single whitespace in the input text.
- `separate_symbols(text)`: Separate symbols and words with a space to ease tokenization.
- `remove_special_characters(text)`: Remove special characters from the input text.
- `standardize_text(text)`: Standardize the formatting of the input text.
- `tokenize_text(text)`: Tokenize the input text into individual words.
- `stem_words(words)`: Stem the input words using the Porter stemming algorithm.
- `lemmatize_words(words)`: Lemmatize the input words using WordNet lemmatization.
- `pos_tag(text)`: Perform part-of-speech (POS) tagging on the input text.
"""

import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

def remove_stopwords(text):
    """
    Remove stopwords from the input text using NLTK's stopwords.

    Parameters:
    - `text` (str): The input text from which stopwords should be removed.

    Returns:
    - `str`: The text without stopwords.
    """
    try:
        stop_words = set(stopwords.words('english'))
        tokens = text.split()
        filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text
    except Exception as e:
        print(f"An error occurred during stopwords removal: {str(e)}")
        return text
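
# Illustrative usage (invented input; output assumes NLTK's English stopword list):
# >>> remove_stopwords("This is a simple example")
# 'simple example'
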
def remove_numbers(text):
    """
    Remove numbers from the input text.

    Parameters:
    - `text` (str): The input text from which numbers should be removed.

    Returns:
    - `str`: The text without numbers.
    """
    try:
        text = re.sub(r'\d+', '', text)
        return text
    except Exception as e:
        print(f"An error occurred during number removal: {str(e)}")
        return text
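
# Illustrative usage; note that whitespace around the removed digits is kept:
# >>> remove_numbers("version 2 of 3 drafts")
# 'version  of  drafts'
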
def remove_whitespace(text):
    """
    Remove excess whitespace from the input text.

    Parameters:
    - `text` (str): The input text from which excess whitespace should be removed.

    Returns:
    - `str`: The text with excess whitespace removed.
    """
    try:
        text = ' '.join(text.split())
        return text
    except Exception as e:
        print(f"An error occurred during whitespace removal: {str(e)}")
        return text
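
# Illustrative usage (invented input):
# >>> remove_whitespace("  too   many    spaces  ")
# 'too many spaces'
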
def normalize_whitespace(text):
    """
    Normalize multiple whitespaces into a single whitespace in the input text.

    Parameters:
    - `text` (str): The input text whose whitespace should be normalized.

    Returns:
    - `str`: The text with normalized whitespace.
    """
    try:
        text = re.sub(r'\s+', ' ', text)
        return text
    except Exception as e:
        print(f"An error occurred during whitespace normalization: {str(e)}")
        return text
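
# Illustrative usage; tabs and newlines collapse to single spaces:
# >>> normalize_whitespace("tabs\tand\nnewlines")
# 'tabs and newlines'
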
def separate_symbols(text):
    """
    Separate symbols and words with a space to ease tokenization.

    Parameters:
    - `text` (str): The input text in which symbols need to be separated.

    Returns:
    - `str`: The text with symbols separated.
    """
    try:
        pattern = r"([\W])"
        separated_text = re.sub(pattern, r" \1 ", text)
        return separated_text
    except Exception as e:
        print(f"An error occurred during symbol separation: {str(e)}")
        return text
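
# Illustrative usage; note the \W pattern also matches spaces, so existing
# whitespace gets widened as well:
# >>> separate_symbols("hello,world")
# 'hello , world'
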
def remove_special_characters(text):
    """
    Remove special characters from the input text.

    Parameters:
    - `text` (str): The input text from which special characters should be removed.

    Returns:
    - `str`: The text with special characters removed.
    """
    try:
        text = text.translate(str.maketrans("", "", string.punctuation))
        special_characters = "@#$%^&*"
        text = ''.join(char for char in text if char not in special_characters)
        return text
    except Exception as e:
        print(f"An error occurred during special character removal: {str(e)}")
        return text
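
# Illustrative usage (invented input):
# >>> remove_special_characters("Hi there, @user!")
# 'Hi there user'
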
def standardize_text(text):
    """
    Standardize the formatting of the input text.

    Parameters:
    - `text` (str): The input text which needs to be standardized.

    Returns:
    - `str`: The standardized text.
    """
    try:
        text = text.lower()
        text = text.strip()
        return text
    except Exception as e:
        print(f"An error occurred during text standardization: {str(e)}")
        return text
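
# Illustrative usage (invented input):
# >>> standardize_text("  MiXeD Case  ")
# 'mixed case'
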
def tokenize_text(text):
    """
    Tokenize the input text into individual words.

    Parameters:
    - `text` (str): The input text to be tokenized.

    Returns:
    - `list`: A list of tokens (words) from the input text.
    """
    tokens = word_tokenize(text)
    return tokens
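
# Illustrative usage; punctuation becomes its own token:
# >>> tokenize_text("Hello, world!")
# ['Hello', ',', 'world', '!']
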
def stem_words(words):
    """
    Stem the input words using the Porter stemming algorithm.

    Parameters:
    - `words` (list): A list of words to be stemmed.

    Returns:
    - `list`: A list of stemmed words.
    """
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in words]
    return stemmed_words
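
# Illustrative usage (invented input):
# >>> stem_words(["running", "jumps"])
# ['run', 'jump']
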
def lemmatize_words(words):
    """
    Lemmatize the input words using WordNet lemmatization.

    Parameters:
    - `words` (list): A list of words to be lemmatized.

    Returns:
    - `list`: A list of lemmatized words.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return lemmatized_words
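
# Illustrative usage; the default WordNet POS is noun:
# >>> lemmatize_words(["dogs", "geese"])
# ['dog', 'goose']
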
def pos_tag(text):
    """
    Perform part-of-speech (POS) tagging on the input text.

    Parameters:
    - `text` (str): The input text to be POS tagged.

    Returns:
    - `list`: A list of tuples containing (word, tag) pairs.
    """
    try:
        tokens = nltk.word_tokenize(text)
        tagged_words = nltk.pos_tag(tokens)
        return tagged_words
    except Exception as e:
        print(f"An error occurred during POS tagging: {str(e)}")
        return []
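
# Illustrative usage (invented input; exact tags can vary across NLTK tagger versions):
# >>> pos_tag("A quick test")
# [('A', 'DT'), ('quick', 'JJ'), ('test', 'NN')]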
