# Sample script: scrapes movie scripts from Scripts.com and saves the text in a .txt file.
#Jenna Ollen
#
#movie_script_scraper.py
#
#VERSION 1.0
#
#LAST EDIT: 2020-02-10
#
#This code scrapes movie scripts from Scripts.com and saves the text in a .txt file.
#It has two optional parameters: a path to save the .txt file and a scripts.com link.
#Some of the code was given to me in Professor Dan Parker's LING 380 class
#in Spring 2019 and was adapted by me.
#
#########################################
#REQUIRED MODULES
#########################################
import argparse
import requests
import re
import os
from urllib.request import urlopen
from bs4 import BeautifulSoup
#########################################
#FUNCTIONS
#########################################
def pullText(link):
    """Download a web page and return the text of its paragraphs.

    Args:
        link: URL of the page to download.

    Returns:
        The text of every ``<p>`` element on the page, in document order,
        joined with newlines.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    # The context manager closes the session (and its pooled connections)
    # as soon as the page has been fetched.
    with requests.Session() as session:
        # Without a timeout a stalled server would hang the script forever.
        page_response = session.get(link, timeout=30)
        # Fail loudly instead of saving an error page as if it were a script.
        page_response.raise_for_status()
        page_content = BeautifulSoup(page_response.content, 'html.parser')
    page_text = page_content.find_all('p')
    # Take the text of each paragraph separately: converting the whole result
    # list with str() and stripping tags afterwards would leave the list's
    # brackets and ", " separators in the output.
    return '\n'.join(paragraph.get_text() for paragraph in page_text)
def fileCreate(path_name,script_name):
    """Save the script found at *script_name* as ``<title>.txt`` in *path_name*.

    The file name is taken from the link itself: for
    ``https://www.scripts.com/script/a_cinderella_story_5578`` the text is
    written to ``a_cinderella_story.txt``.

    Args:
        path_name: Directory in which the .txt file is created.
        script_name: Link of the form
            ``https://www.scripts.com/script/<title>_<number>``.

    Raises:
        ValueError: If *script_name* does not have the expected form, so no
            file name can be derived from it.
    """
    # Raw string: "\_" inside a normal string literal is an invalid escape
    # sequence. The greedy group keeps everything up to the last "_<digit>".
    match = re.search(r"https://www\.scripts\.com/script/(.*)_[0-9]", script_name)
    if match is None:
        raise ValueError(
            "Cannot derive a file name from link: {!r}".format(script_name)
        )
    # Download before opening the file so that a failed request does not
    # leave an empty .txt file behind.
    script_text = pullText(script_name)
    file_path = os.path.join(path_name, match.group(1) + '.txt')
    # Explicit UTF-8 so non-ASCII characters in a script do not depend on the
    # platform's default encoding.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(script_text)
    return
#########################################
#MAIN
#########################################
def main():
    """Parse the command-line options and save the requested script.

    Options:
        -p / --path: directory for the .txt file (default: current directory).
        --link: link of the script to download (default: a sample script).
    """
    # Check command-line arguments
    p = argparse.ArgumentParser(description="This code scrapes movie scripts from Scripts.com and saves the text in a .txt file." )
    p.add_argument("-p", "--path", default=".",
                   help="Path to save .txt file")
    p.add_argument("--link", default="https://www.scripts.com/script/a_cinderella_story_5578",
                   help="Link to script.com link to movie script")
    args = p.parse_args()
    fileCreate(args.path,args.link)


# Only run when executed as a script, so importing this module does not
# parse sys.argv or start a download.
if __name__ == "__main__":
    main()