Excel Spreadsheet Cleaner (.py)
For my honors thesis, I need to convert lots of documents into spreadsheets and clean them. Sometimes, extra whitespace slips through the cracks. This script removes leading, trailing, and duplicate whitespace from .xls and .xlsx files and outputs a new, cleaned .xlsx file.
#!/usr/bin/env python3
#
# VERSION 0.1
#
# LAST EDIT: 2019-09-16
#
# This script removes leading, trailing, and duplicate spaces in
# .xls and .xlsx files and creates a new .xlsx file
#
##############################################################################
# REQUIRED MODULES
##############################################################################
import pandas as pd
import os
import re
import sys
##############################################################################
# FUNCTIONS
##############################################################################
def show_help():
"""
Name: show_help
Inputs: None
Outputs: None
Features: Prints the help text when user uses the '-h' command flag
"""
= ("FILE: \t spreadsheet_cleaner.py\n"
help_txt "DESC: \t Removes leading, trailing, and duplicate spaces\n"
"\t in .xlsx and .xls files in a given directory.\n"
"\t User will be prompted for path to directory\n"
"\t containing Excel files.\n"
"NOTE: \t Do not run on files that are currently open.\n"
"USGE: \t app_name [options]\n"
"\t -h, --help shows the help text\n")
print(help_txt)
##############################################################################
# CLASSES
##############################################################################
class SpreadSheet(object):
"""
Name: SpreadSheet
Features: Class for cleaning Excel files
History: Version 0.1
- Written for class
- Intended to help with Honors data cleaning
"""
# \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
# Class Parameters
# ////////////////////////////////////////////////////////////////////////
# None
# \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
# Class Initialization
# ////////////////////////////////////////////////////////////////////////
def __init__(self, base_dir):
"""
Name: SpreadSheet.__init__
Inputs: str, base directory (base_dir)
Features: Initializes the SpreadSheet class
Reference: Way of checking directory is from Prof. Davis's music_randomizer
"""
# Intitialize class variables:
self.file_list = []
if os.path.isdir(base_dir):
self.base_dir = base_dir
else:
self.base_dir = None
raise OSError("Directory '%s' does not exist" % (base_dir))
# \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
# Class Property Definitions
# ////////////////////////////////////////////////////////////////////////
# None
# \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
# Class Function Definitions
# ////////////////////////////////////////////////////////////////////////
def cleaning(self, file, name):
"""
Name: SpreadSheet.cleaning
Inputs: Excel file
Outputs: New, clean .xlsx file
Features: Removes whitespace and duplicate spaces from pd dataframes and saves new file.
Reference: https://stackoverflow.com/questions/1546226/simple-way-to-remove-multiple-spaces-in-a-string/1546244
"""
= pd.read_excel(file, index_col=None, header=0)
data = list(data.columns)
col_names for col in col_names:
= data[col].astype(str)
data[col] = data[col].str.strip()
data[col] = data[col].str.replace('[ \t\n\r]+', " ")
data[col] = pd.ExcelWriter(name, engine='xlsxwriter')
writer =False)
data.to_excel(writer, index
writer.save()
def get_files(self, base_dir):
"""
Name: SpreadSheet.get_files
Inputs: None.
Outputs: None.
Features: Creates a list of .xls and .xlsx files in the base directory
Reference: https://stackoverflow.com/questions/10377998/how-can-i-iterate-over-files-in-a-given-directory
"""
= base_dir
directory # have already checked if directory exists.
for filename in os.listdir(directory):
if filename.endswith(".xls") or filename.endswith(".xlsx"):
self.file_list.append(os.path.join(directory, filename))
else:
continue
def run(self, base_dir):
"""
Name: SpreadSheet.run
Inputs: file (from get_files), col_names (from get_columns)
Outputs: Clean files
Features: Cleans Excel files and returns renamed files
Reference:
"""
self.get_files(base_dir)
for file_name in self.file_list:
= str(file_name).split(".")
names = "." + str(names[1]) + "_tidy." + str(names[2])
new_name self.cleaning(file_name, new_name)
##############################################################################
# MAIN
##############################################################################
if __name__ == "__main__":
if '-h' in sys.argv or '--help' in sys.argv:
show_help()else:
= str(input("Type path to directory: "))
input_path = SpreadSheet(input_path)
utility utility.run(input_path)