I am a senior, double majoring in Data Science and English.
I made this script to clean text so it can be used for natural language processing. Repeating these preprocessing steps for every NLP task can be frustrating, so this script takes care of that for you: it takes a .txt file, performs common preprocessing tasks, and outputs a new .txt file containing the final text after lowercasing; removing numbers, punctuation, and stopwords; and tokenizing and lemmatizing the text.
# text_preprocessing.py
# By: Caroline Wall
# Version 1.0
# Last Edit: 2021-09-18
# This script performs text preprocessing on a .txt file by removing
# stop words, removing numbers, removing punctuation, lowercasing,
# tokenizing, and lemmatizing. It then returns the preprocessed text
# in a .txt file.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
# Fetch only the NLTK resources this script actually uses; calling
# nltk.download() with no arguments would open the interactive downloader.
nltk.download('punkt', quiet=True)       # tokenizer models for word_tokenize
nltk.download('stopwords', quiet=True)   # stopword lists per language
nltk.download('wordnet', quiet=True)     # lemmatizer data
nltk.download('omw-1.4', quiet=True)     # WordNet multilingual data

# User inputs information about the text
path_to_file = input('Enter file path to .txt file: ')
output_file = input('Enter the name you want for your preprocessed data file: ')
language = input('Enter text language: ')
extra_stops = input('Enter common words in text: ')

# Read in the text from the user-supplied path (not a hard-coded file);
# the context manager closes the handle even if read() raises.
with open(path_to_file, 'r') as text_file:
    text = text_file.read()

# Create list of stopwords, combining NLTK's set with user-inputted words.
# The extra words are normalized (punctuation stripped, lowercased) the same
# way the text is below, so membership tests match.
stop_words = set(stopwords.words(language))
extra_stops = extra_stops.translate(extra_stops.maketrans('', '', string.punctuation)).lower()
# split() (no argument) also collapses runs of whitespace, so no
# empty-string "stopwords" get added.
stop_words.update(extra_stops.split())

# Remove all numbers from the text (one C-level pass via str.translate)
text_no_nums = text.translate(text.maketrans('', '', string.digits))

# Lowercase text and remove all punctuation
clean_text = text_no_nums.translate(text_no_nums.maketrans('', '', string.punctuation)).lower()

# Tokenize the text
words = word_tokenize(clean_text)

# Remove stopwords
clean_text_no_stops = [word for word in words if word not in stop_words]

# Perform lemmatization
lemmatizer = WordNetLemmatizer()
final_text = [lemmatizer.lemmatize(word) for word in clean_text_no_stops]

# Create .txt file of preprocessed data using requested file name.
# Append mode is preserved; the with-block closes the handle, which the
# original `print(..., file=open(...))` leaked.
with open(output_file, 'a') as out:
    print(final_text, file=out)
print('Your preprocessed text data has been created.')