Updated DupliPy

Infinitode · Infinitode · commit c8e1292c3ab6 · 2024-07-03T13:54:38.000+02:00
diff --git a/.github/workflows/build_and_publish.yml b/.github/workflows/build_and_publish.yml
@@ -0,0 +1,34 @@
+name: Publish Python 🐍 distributions 📦 to PyPI
+# Runs on every pushed tag: stamps the tag into setup.py, builds, and uploads to PyPI.
+on:
+  push:
+    tags:
+       - '*'
+
+jobs:
+  build-n-publish:
+    name: Build and publish Python 🐍 distributions 📦 to PyPI
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4  # released major version instead of a moving branch
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+    - name: Install pypa/setuptools
+      run: >-
+        python -m
+        pip install setuptools wheel
+    - name: Extract tag name
+      id: tag  # TAG_NAME is read below as steps.tag.outputs.TAG_NAME
+      run: echo "TAG_NAME=$(echo "$GITHUB_REF" | cut -d / -f 3)" >> "$GITHUB_OUTPUT"
+    - name: Update version in setup.py
+      run: >-
+        sed -i "s/{{VERSION_PLACEHOLDER}}/${{ steps.tag.outputs.TAG_NAME }}/g" setup.py
+    - name: Build a binary wheel
+      run: >-
+        python setup.py sdist bdist_wheel
+    - name: Publish distribution 📦 to PyPI
+      uses: pypa/gh-action-pypi-publish@release/v1  # the action's master branch is sunset upstream
+      with:
+        password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+__pycache__
+*.pyc
+dist/
+build/
+venv/
diff --git a/duplipy/__init__.py b/duplipy/__init__.py
@@ -1,5 +1,5 @@
 import duplipy
-from .formatting import remove_stopwords, remove_numbers, remove_whitespace, normalize_whitespace, separate_symbols, remove_special_characters, standardize_text, tokenize_text, stem_words, lemmatize_words, pos_tag
+from .formatting import remove_stopwords, remove_numbers, remove_whitespace, normalize_whitespace, separate_symbols, remove_special_characters, standardize_text, tokenize_text, stem_words, lemmatize_words, pos_tag, remove_profanity_from_text, remove_sensitive_info_from_text, remove_hate_speech_from_text
 from .replication import replace_word_with_synonym, augment_text_with_synonyms, load_text_file, augment_file_with_synonyms, insert_random_word, delete_random_word, insert_synonym, paraphrase, flip_horizontal, flip_vertical, rotate, random_rotation, resize, crop, random_crop, shuffle_words
 from .similarity import edit_distance_score, bleu_score, jaccard_similarity_score
 from .text_analysis import analyze_sentiment
diff --git a/duplipy/formatting.py b/duplipy/formatting.py
@@ -13,12 +13,16 @@
 - `stem_words(words)`: Stem the input words using Porter stemming algorithm.
 - `lemmatize_words(words)`: Lemmatize the input words using WordNet lemmatization.
 - `pos_tag(text)`: Perform part-of-speech (POS) tagging on the input text.
+- `remove_profanity_from_text(text)`: Remove profane words from the input text.
+- `remove_sensitive_info_from_text(text)`: Remove sensitive information from the input text.
+- `remove_hate_speech_from_text(text)`: Remove hate speech or offensive speech from the input text.
 """
 
 
 import string
 import re
 import nltk
+from valx import remove_profanity, remove_sensitive_information, detect_hate_speech
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from nltk.stem import PorterStemmer, WordNetLemmatizer
@@ -244,4 +248,62 @@ def pos_tag(text):
         return tagged_words
     except Exception as e:
         print(f"An error occurred during POS tagging: {str(e)}")
-        return []
+        return []
+
+def remove_profanity_from_text(text):
+    """
+    Remove profane words from the input text.
+
+    The text is split into sentences with NLTK, the sentence list is passed to
+    ValX's `remove_profanity` (language='All'), and the results are re-joined.
+
+    Parameters:
+    - `text` (str): The input text to remove profanity from.
+
+    Returns:
+    - `text` (str): The cleaned output text, sentences separated by one space.
+    """
+    sentences = nltk.sent_tokenize(text)
+    cleaned_sentences = remove_profanity(sentences, language='All')
+    # sent_tokenize keeps each sentence's end punctuation, so join with a space, not '. '.
+    return ' '.join(cleaned_sentences)
+
+def remove_sensitive_info_from_text(text):
+    """
+    Remove sensitive information from the input text.
+
+    This can be useful for depersonalization of text data. The text is split into
+    sentences with NLTK, cleaned by ValX's `remove_sensitive_information`, and re-joined.
+
+    Parameters:
+    - `text` (str): The input text to remove sensitive information from.
+
+    Returns:
+    - `text` (str): The cleaned output text, sentences separated by one space.
+    """
+    sentences = nltk.sent_tokenize(text)
+    cleaned_sentences = remove_sensitive_information(sentences)
+    # sent_tokenize keeps each sentence's end punctuation, so join with a space, not '. '.
+    return ' '.join(cleaned_sentences)
+
+def remove_hate_speech_from_text(text):
+    """
+    Remove hate speech or offensive speech from the input text.
+
+    Whole sentences are dropped rather than single words, because the
+    classification depends on context.
+
+    Parameters:
+    - `text` (str): The input text to remove hate speech and offensive speech from.
+
+    Returns:
+    - `text` (str): The sentences classified as clean, separated by one space.
+    """
+    # Only the explicit clean verdict keeps a sentence; every other result drops it.
+    clean_verdict = ['No Hate and Offensive Speech']
+    kept_sentences = [
+        sentence for sentence in nltk.sent_tokenize(text)
+        if detect_hate_speech(sentence) == clean_verdict
+    ]
+    # sent_tokenize keeps each sentence's end punctuation, so join with a space, not '. '.
+    return ' '.join(kept_sentences)
diff --git a/publish steps.txt b/publish steps.txt
@@ -0,0 +1,43 @@
+To publish your Python package using Twine, you'll need to perform a few steps. First, make sure you have the following prerequisites installed:
+
+Setuptools: This is typically used for packaging Python projects.
+
+bash
+Copy code
+pip install setuptools
+Wheel: This is a built-package format that can be installed with pip.
+
+bash
+Copy code
+pip install wheel
+Twine: This is a utility for publishing Python packages on the Python Package Index (PyPI).
+
+bash
+Copy code
+pip install twine
+Once you have these installed, follow these steps to publish your package:
+
+1. Package your Project
+Navigate to your project's root directory in the terminal and run the following command to create a source distribution and a wheel distribution:
+
+bash
+Copy code
+python setup.py sdist bdist_wheel
+This command will generate a dist directory containing your packaged project.
+
+2. Create a PyPI Account
+Make sure you have an account on the PyPI website. You'll need this account to upload your package.
+
+3. Upload your Package
+Use Twine to upload your package to PyPI:
+
+bash
+Copy code
+twine upload dist/* -u __token__ -p pypi-token
+This command uploads all files in the dist directory to PyPI. Replace pypi-token with your own PyPI API token.
+
+4. Enter your PyPI Credentials
+If you omit the -u and -p options, Twine will prompt you for a username and password. For API-token authentication, enter __token__ as the username and your PyPI API token as the password.
+
+5. Verify your Package on PyPI
+Visit your project's page on PyPI to verify that your package has been successfully uploaded.
diff --git a/readme.md b/readme.md
@@ -1,4 +1,4 @@
-# DupliPy 0.2.0
+# DupliPy 0.2.1
 ![Python Version](https://img.shields.io/badge/python-3.12-blue.svg)
 ![Code Size](https://img.shields.io/github/languages/code-size/infinitode/duplipy)
 ![Downloads](https://pepy.tech/badge/duplipy)
@@ -7,9 +7,9 @@
 
 An open source Python library for text formatting, augmentation, and similarity calculation tasks in NLP, the package now also includes additional methods for image augmentation.
 
-## Changes to DupliPy 0.2.0
+## Changes to DupliPy 0.2.1
 
-DupliPy now includes useful method descriptions in docstrings, allowing anyone to quickly see what a method does and why it is used. DupliPy also now includes a few extra methods in `replication` and `similarity`, including `shuffle_words()` and `jaccard_similarity_score()` .
+DupliPy now utilizes ValX, another of our Python packages, which provides quick methods for cleaning and formatting text data during preprocessing, before training.
 
 ## Installation
 
@@ -29,7 +29,7 @@ DupliPy supports the following Python versions:
 - Python 3.9
 - Python 3.10
 - Python 3.11
-- Python 3.12
+- Python 3.12 or later
 
 Please ensure that you have one of these Python versions installed before using DupliPy. DupliPy may not work as expected on lower versions of Python than the supported.
 
@@ -40,7 +40,8 @@ Please ensure that you have one of these Python versions installed before using
 - Sentiment Analysis: Find impressions within sentences.
 - Similarity Calculation: Calculate text similarity using various metrics.
 - BLEU Score Calculation: Calculate how well your text-based NLP model performs.
-- Image Augmentation Tasks **(NEW)**
+- Image Augmentation Tasks.
+- Profanity removal, hate speech removal, offensive speech removal, and sensitive information removal.
 
 *For full reference documentation view [DupliPy's official documentation](https://infinitode-docs.gitbook.io/documentation/package-documentation/duplipy-package-documentation).*
 
@@ -157,6 +158,19 @@ resized_image.save("path/to/resized.jpg")
 randomly_cropped_image.save("path/to/randomly_cropped.jpg")
 ```
 
+### Hate speech and Offensive speech removal using AI
+
+```python
+from duplipy.formatting import remove_hate_speech_from_text
+
+text = "I hate all of you bad word! Can't you just bad word leave me alone! Hi, I'm Katy."
+
+print(remove_hate_speech_from_text(text))
+
+### Output
+# "Hi, I'm Katy."
+```
+
 ## Contributing
 
 Contributions are welcome! If you encounter any issues, have suggestions, or want to contribute to DupliPy, please open an issue or submit a pull request on [GitHub](https://github.com/infinitode/duplipy).
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='duplipy',
-    version='0.2.0',
+    version='{{VERSION_PLACEHOLDER}}',
     author='Infinitode Pty Ltd',
     author_email='infinitode.ltd@gmail.com',
     description='A package for formatting and text replication, with added support for image augmentation.',
@@ -17,6 +17,7 @@
         'joblib',
         'tqdm',
         'pillow',
+        'valx'
     ],
     classifiers=[
         'Development Status :: 5 - Production/Stable',
diff --git a/test.py b/test.py
@@ -0,0 +1,3 @@
+from duplipy.formatting import remove_hate_speech_from_text
+# Manual smoke check: print what remains after flagged sentences are removed.
+print(remove_hate_speech_from_text("Hello friend! Goodbye Fag!"))

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +__pycache__
 +*.pyc
 +dist/
 +build/
 +venv/
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from duplipy.formatting import remove_hate_speech_from_text`
	`2`	`+`
	`3`	`+print(remove_hate_speech_from_text("Hello friend! Goodbye Fag!"))`