Commit 00c2d0b (1 parent: dd96d7f)

Updated docstrings, added new methods

We updated the docstrings for all of the functions with new and improved descriptions and usage notes. We also added new functions to `replication` and `similarity`.

7 files changed: 133 additions & 17 deletions

duplipy/__init__.py
Lines changed: 3 additions & 3 deletions

```diff
@@ -1,5 +1,5 @@
-import source
+import duplipy
 from .formatting import remove_stopwords, remove_numbers, remove_whitespace, normalize_whitespace, separate_symbols, remove_special_characters, standardize_text, tokenize_text, stem_words, lemmatize_words, pos_tag
-from .replication import replace_word_with_synonym, augment_text_with_synonyms, load_text_file, augment_file_with_synonyms, insert_random_word, delete_random_word, insert_synonym, paraphrase, flip_horizontal, flip_vertical, rotate, random_rotation, resize, crop, random_crop
-from .similarity import edit_distance_score, bleu_score
+from .replication import replace_word_with_synonym, augment_text_with_synonyms, load_text_file, augment_file_with_synonyms, insert_random_word, delete_random_word, insert_synonym, paraphrase, flip_horizontal, flip_vertical, rotate, random_rotation, resize, crop, random_crop, shuffle_words
+from .similarity import edit_distance_score, bleu_score, jaccard_similarity_score
 from .text_analysis import analyze_sentiment
```

duplipy/formatting.py
Lines changed: 33 additions & 0 deletions

```diff
@@ -32,6 +32,9 @@ def remove_stopwords(text):
     """
     Remove stopwords from the input text using NLTK's stopwords.
 
+    Stopwords are frequently used words (e.g., 'the', 'and', 'is') that are often
+    excluded from text processing to focus on more meaningful content.
+
     Parameters:
     - `text` (str): The input text from which stopwords should be removed.
 
@@ -52,6 +55,8 @@ def remove_numbers(text):
     """
     Remove numbers from the input text.
 
+    Numerical digits are removed from the text to focus on the non-numeric content.
+
     Parameters:
     - `text` (str): The input text from which numbers should be removed.
 
@@ -69,6 +74,9 @@ def remove_whitespace(text):
     """
     Remove excess whitespace from the input text.
 
+    Excess whitespace, including leading, trailing, and multiple consecutive spaces,
+    is removed from the text to create a more standardized and readable format.
+
     Parameters:
     - `text` (str): The input text from which excess whitespace should be removed.
 
@@ -86,6 +94,9 @@ def normalize_whitespace(text):
     """
     Normalize multiple whitespaces into a single whitespace in the input text.
 
+    Multiple consecutive whitespaces are replaced with a single whitespace to
+    create a more consistent and readable text format.
+
     Parameters:
     - `text` (str): The input text from which whitespace should be normalized.
 
@@ -103,6 +114,9 @@ def separate_symbols(text):
     """
     Separate symbols and words with a space to ease tokenization.
 
+    Symbols in the input text are separated from words with a space to facilitate
+    easier tokenization and analysis of the text.
+
     Parameters:
     - `text` (str): The input text from which symbols need to be separated.
 
@@ -121,6 +135,9 @@ def remove_special_characters(text):
     """
     Remove special characters from the input text.
 
+    Special characters, such as punctuation and user-defined symbols, are removed
+    to create a text without these non-alphanumeric elements.
+
     Parameters:
     - `text` (str): The input text from which special characters should be removed.
 
@@ -140,6 +157,9 @@ def standardize_text(text):
     """
     Standardize the formatting of the input text.
 
+    The input text is converted to lowercase and leading/trailing whitespaces are removed
+    to create a standardized representation for easier comparison and analysis.
+
     Parameters:
     - `text` (str): The input text which needs to be standardized.
 
@@ -158,6 +178,10 @@ def tokenize_text(text):
     """
     Tokenize the input text into individual words.
 
+    Tokenization is the process of breaking down a text into individual words,
+    facilitating further analysis, such as counting word frequencies or analyzing
+    language patterns.
+
     Parameters:
     - `text` (str): The input text to be tokenized.
 
@@ -171,6 +195,9 @@ def stem_words(words):
     """
     Stem the input words using the Porter stemming algorithm.
 
+    Stemming reduces words to their base or root form, helping to consolidate
+    variations of words and simplify text analysis.
+
     Parameters:
     - `words` (list): A list of words to be stemmed.
 
@@ -185,6 +212,9 @@ def lemmatize_words(words):
     """
     Lemmatize the input words using WordNet lemmatization.
 
+    Lemmatization reduces words to their base or dictionary form, helping to
+    normalize variations and simplify text analysis.
+
     Parameters:
     - `words` (list): A list of words to be lemmatized.
 
@@ -199,6 +229,9 @@ def pos_tag(text):
     """
     Perform part-of-speech (POS) tagging on the input text.
 
+    Part-of-speech tagging assigns a grammatical category (tag) to each word
+    in a text, aiding in syntactic analysis and understanding sentence structure.
+
     Parameters:
     - `text` (str): The input text to be POS tagged.
```
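The full function bodies are not part of this diff, but the behavior the `standardize_text` and `normalize_whitespace` docstrings describe is easy to sketch in plain Python (the helper name below is hypothetical, not DupliPy's actual implementation):

```python
import re

def standardize_text_sketch(text):
    # Collapse runs of whitespace into single spaces, trim the ends,
    # and lowercase -- mirroring what the docstrings above describe.
    return re.sub(r"\s+", " ", text).strip().lower()

print(standardize_text_sketch("  Hello   WORLD  "))  # hello world
```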

duplipy/replication.py
Lines changed: 44 additions & 1 deletion

```diff
@@ -17,13 +17,15 @@
 - `resize(image, size)`: Resize the input image to the specified size.
 - `crop(image, box)`: Crop the input image to the specified rectangular region.
 - `random_crop(image, size)`: Randomly crop a region from the input image.
+- `shuffle_words(text)`: Randomly shuffle the order of words in each sentence.
 """
 
 import random
 import time
 import nltk
 from nltk.corpus import wordnet
 from PIL import Image
+from tqdm import tqdm
 
 nltk.download("wordnet", quiet=True)
 nltk.download("averaged_perceptron_tagger", quiet=True)
@@ -33,6 +35,9 @@ def replace_word_with_synonym(word):
     """
     Replace the given word with a synonym.
 
+    Synonyms are alternative words with similar meanings, and replacing words
+    with synonyms can be used for text augmentation or variation.
+
     Params:
     - `word` (str): The input word to replace with a synonym.
 
@@ -155,6 +160,9 @@ def insert_random_word(text, word):
     """
     Insert a random word into the input text.
 
+    This function randomly inserts a specified word into the input text, creating
+    variations for text augmentation or diversification.
+
     Parameters:
     - `text` (str): The input text for word insertion.
     - `word` (str): The word to be inserted into the text.
@@ -176,6 +184,9 @@ def delete_random_word(text):
     """
     Delete a random word from the input text.
 
+    This function randomly deletes a word from the input text, creating variations
+    for text augmentation or diversity.
+
     Parameters:
     - `text` (str): The input text for word deletion.
 
@@ -197,6 +208,9 @@ def insert_synonym(text, word):
     """
     Insert a synonym of the given word into the input text.
 
+    This function replaces the specified word in the input text with a synonym,
+    introducing variations for text augmentation or diversity.
+
     Parameters:
     - `text` (str): The input text for synonym insertion.
     - `word` (str): The word for which a synonym will be inserted.
@@ -217,6 +231,9 @@ def paraphrase(text):
     """
     Paraphrase the input text.
 
+    This function leverages part-of-speech tagging to identify verbs (VB), nouns (NN),
+    and adjectives (JJ) in the input text, replacing them with synonyms for paraphrasing.
+
     Parameters:
     - `text` (str): The input text to be paraphrased.
 
@@ -327,4 +344,30 @@ def random_crop(image, size):
     upper = random.randint(0, height - size[1])
     right = left + size[0]
     lower = upper + size[1]
-    return crop(image, (left, upper, right, lower))
+    return crop(image, (left, upper, right, lower))
+
+# DupliPy 0.2.0
+
+def shuffle_words(text):
+    """
+    Randomly shuffle the order of words in each sentence.
+
+    This function takes a list of sentences and randomly shuffles the order of words
+    in each sentence, creating variations for text augmentation or diversity.
+
+    Parameters:
+    - `text` (list of str): List of sentences where each sentence's words need to be shuffled.
+
+    Returns:
+    - `list of str`: List of sentences with randomly shuffled words.
+    """
+    # Shuffle the order of words in each sentence
+    shuffled_text = []
+    with tqdm(total=len(text), desc="Shuffling Words") as pbar:
+        for sentence in text:
+            words = sentence.split()
+            shuffled_words = random.sample(words, len(words))
+            shuffled_sentence = ' '.join(shuffled_words)
+            shuffled_text.append(shuffled_sentence)
+            pbar.update(1)
+    return shuffled_text
```
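Since `random.sample(words, len(words))` returns a permutation, the new `shuffle_words` reorders words but never drops or duplicates any. A minimal sketch of that behavior (hypothetical helper name, without the `tqdm` progress bar):

```python
import random

def shuffle_words_sketch(sentences):
    # Each sentence keeps exactly the same words, in a new random order.
    return [" ".join(random.sample(s.split(), len(s.split()))) for s in sentences]

random.seed(0)
shuffled = shuffle_words_sketch(["the quick brown fox"])
# Regardless of the shuffle order, the word multiset is preserved:
print(sorted(shuffled[0].split()))  # ['brown', 'fox', 'quick', 'the']
```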

duplipy/similarity.py
Lines changed: 39 additions & 4 deletions

```diff
@@ -3,6 +3,8 @@
 
 Available functions:
 - `edit_distance_score(text1, text2)`: Calculate the edit distance score between two texts.
+- `bleu_score(reference, candidate)`: Calculate the BLEU score between a reference sentence and a candidate sentence.
+- `jaccard_similarity_score(text1, text2)`: Calculate Jaccard similarity between two texts.
 """
 
 import nltk
@@ -13,12 +15,17 @@ def edit_distance_score(text1, text2):
     """
     Calculate the edit distance score between two texts.
 
+    The edit distance, also known as Levenshtein distance, is a measure of the
+    minimum number of single-character edits (insertions, deletions, or
+    substitutions) required to transform one text into another.
+
     Parameters:
     - `text1` (str): The first text.
     - `text2` (str): The second text.
 
     Returns:
-    - `int`: The edit distance score.
+    - `int`: The edit distance score between the two texts. A lower score
+      indicates greater similarity, with 0 meaning the texts are identical.
     """
     try:
         # Calculate the edit distance
@@ -30,14 +37,17 @@
 
 def bleu_score(reference, candidate):
     """
-    Calculate the BLEU score between a reference sentence and a candidate sentence.
+    Calculate the BLEU (Bilingual Evaluation Understudy) score between a reference sentence and a candidate sentence.
+
+    BLEU is a metric commonly used for evaluating the quality of machine-translated text. It measures the precision of the
+    candidate sentence's n-grams (contiguous sequences of n items) against the reference sentence.
 
     Parameters:
     - `reference` (str): The reference sentence.
     - `candidate` (str): The candidate sentence.
 
     Returns:
-    - `float`: The BLEU score.
+    - `float`: The BLEU score. The score ranges from 0 (no similarity) to 1 (perfect match).
     """
     try:
         # Tokenize the reference and candidate sentences
@@ -49,4 +59,29 @@
         return bleu
     except Exception as e:
         print(f"An error occurred during BLEU score calculation: {str(e)}")
-        return 0.0
+        return 0.0
+
+# DupliPy 0.2.0
+
+def jaccard_similarity_score(text1, text2):
+    """
+    Calculate Jaccard similarity between two texts.
+
+    Jaccard similarity is a measure of similarity between two sets. In the context
+    of text comparison, it calculates the similarity between the sets of words
+    in two texts.
+
+    Parameters:
+    - `text1` (str): The first text for comparison.
+    - `text2` (str): The second text for comparison.
+
+    Returns:
+    - `float`: Jaccard similarity score between the two texts. The score ranges
+      from 0 (no similarity) to 1 (complete similarity).
+    """
+    set1 = set(text1.split())
+    set2 = set(text2.split())
+    intersection = len(set1.intersection(set2))
+    union = len(set1.union(set2))
+    similarity_score = intersection / union if union != 0 else 0
+    return similarity_score
```
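Both metrics documented above are small enough to sketch in plain Python. The versions below (hypothetical names, not DupliPy's code) show a dynamic-programming Levenshtein distance and the word-set Jaccard score the docstrings describe:

```python
def edit_distance_sketch(a, b):
    # Row-by-row dynamic programming: prev[j] is the distance
    # between the prefix of a seen so far and b[:j].
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return prev[-1]

def jaccard_sketch(text1, text2):
    # |intersection| / |union| over the two word sets.
    s1, s2 = set(text1.split()), set(text2.split())
    union = s1 | s2
    return len(s1 & s2) / len(union) if union else 0.0

print(edit_distance_sketch("kitten", "sitting"))     # 3
# {'the', 'cat'} shared, union of 4 words -> 2/4 = 0.5
print(jaccard_sketch("the cat sat", "the cat ran"))  # 0.5
```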

duplipy/text_analysis.py
Lines changed: 3 additions & 0 deletions

```diff
@@ -14,6 +14,9 @@ def analyze_sentiment(text):
     """
     Analyze the sentiment of the input text using NLTK's SentimentIntensityAnalyzer.
 
+    Sentiment analysis assesses the emotional tone of a text, providing a sentiment
+    score ranging from -1 (negative) to 1 (positive).
+
     Parameters:
     - `text` (str): The input text to be analyzed.
```

readme.md
Lines changed: 8 additions & 7 deletions

````diff
@@ -1,18 +1,15 @@
-# DupliPy 0.1.9
-![Python Version](https://img.shields.io/badge/python-3.11-blue.svg)
+# DupliPy 0.2.0
+![Python Version](https://img.shields.io/badge/python-3.12-blue.svg)
 ![Code Size](https://img.shields.io/github/languages/code-size/infinitode/duplipy)
 ![Downloads](https://pepy.tech/badge/duplipy)
 ![License Compliance](https://img.shields.io/badge/license-compliance-brightgreen.svg)
 ![PyPI Version](https://img.shields.io/pypi/v/duplipy)
 
 An open source Python library for text formatting, augmentation, and similarity calculation tasks in NLP. The package now also includes additional methods for image augmentation.
 
-## Changes to DupliPy
+## Changes to DupliPy 0.2.0
 
-DupliPy now offers support for image augmentation, with functions to rotate, resize and crop images. These are available through:
-```python
-from duplipy.replication import flip_horizontal, flip_vertical, rotate, random_rotation, resize, crop, random_crop
-```
+DupliPy now includes useful method descriptions in docstrings, allowing anyone to quickly see what a method does and why it is used. DupliPy also includes a few extra methods in `replication` and `similarity`, including `shuffle_words()` and `jaccard_similarity_score()`.
 
 ## Installation
 
@@ -32,6 +29,7 @@ DupliPy supports the following Python versions:
 - Python 3.9
 - Python 3.10
 - Python 3.11
+- Python 3.12
 
 Please ensure that you have one of these Python versions installed before using DupliPy. DupliPy may not work as expected on versions of Python lower than those supported.
 
@@ -42,6 +40,9 @@
 - Sentiment Analysis: Find impressions within sentences.
 - Similarity Calculation: Calculate text similarity using various metrics.
 - BLEU Score Calculation: Calculate how well your text-based NLP model performs.
+- Image Augmentation Tasks **(NEW)**
+
+*For full reference documentation, see [DupliPy's official documentation](https://infinitode-docs.gitbook.io/documentation/package-documentation/duplipy-package-documentation).*
 
 ## Usage
 
````

setup.py
Lines changed: 3 additions & 2 deletions

```diff
@@ -2,7 +2,7 @@
 
 setup(
     name='duplipy',
-    version='0.1.9',
+    version='0.2.0',
     author='Infinitode Pty Ltd',
     author_email='infinitode.ltd@gmail.com',
     description='A package for formatting and text replication, with added support for image augmentation.',
@@ -19,7 +19,7 @@
         'pillow',
     ],
     classifiers=[
-        'Development Status :: 3 - Alpha',
+        'Development Status :: 5 - Production/Stable',
         'Intended Audience :: Developers',
         'License :: OSI Approved :: MIT License',
         'Programming Language :: Python :: 3',
@@ -29,6 +29,7 @@
         'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: 3.10',
         'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: 3.12',
     ],
     python_requires='>=3.6',
 )
```
