The following code scrapes a list of websites using Python. Most of the explanations are provided within the code as comments.
We first import a few packages to assist with scraping, and set the working directory.
#Import Packages
import requests
from bs4 import BeautifulSoup
import lxml
import os
import re
import pandas as pd
import numpy as np
from urllib.parse import urljoin
#Set Working Directory
os.getcwd()
os.chdir("Your/File/Path/Here")
Once the environment is set up, I wrote a variety of functions to conduct different types of scrapes. Doing it this way lets me build more flexible scrapes in the future and keeps me from repeating the same commands.
#Define functions to scrape websites
def scrape_prep(link, home):
    """This function prepares a URL for further processing. You provide two arguments:
    link=the link you want to scrape and prepare
    home=for use with sub-menus. If the link you provide is a menu link (so, '/Services' instead of a full site),
    put the URL where the menu originates here ('www.awebsite.com').
    If link is a regular site, place an empty string here."""
    #Prepare the url
    olink = link
    nhome = home
    match = re.search(r"http", home)
    if match is None:
        nhome = "http://" + home
    www = re.search(r"www\.", link)
    if www is None:
        olink = urljoin(nhome, link)
    match = re.search(r"http", olink)
    if match is None:
        olink = "http://" + link
    #print(olink) #turn this on to troubleshoot
    #Import the text
    try:
        r = requests.get(olink)
    except requests.exceptions.RequestException:
        print(olink, "Error!")
        return ""
    html_doc = r.text
    #Turn it into soup
    soup = BeautifulSoup(html_doc, 'lxml')
    #psoup = soup.prettify() #turn this on to inspect the raw HTML
    #Return the parsed document
    return soup
def scrape_main(soup):
    """This returns all of the text in the provided soup, without cleaning or editing. Will include menu items."""
    alltxt = soup.get_text()
    return alltxt
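As a quick illustration of how these first two functions work together, here is a minimal sketch; the URL is a hypothetical placeholder, and it assumes the site is reachable:
#Example usage (hypothetical URL): prepare the soup, then dump the raw page text
example_soup = scrape_prep("www.example-hospital.com", "")
if example_soup != "":
    print(scrape_main(example_soup)[:500]) #first 500 characters of the page text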
def scrape_menu(soup, types):
    """This returns all menu items. Depending on the website, it may include sub-menu items.
    soup=soup of the website you want
    types=what type of return you want:
        'link' returns menu links
        'text' returns the menu names
        'dict' returns a dictionary where the menu names are keys and the URLs are values
    Can be told to grab the text, links, or a dictionary that contains both."""
    #Initialize lists
    menu = []
    menlk = []
    #Loop over the list items and fill the lists with names and URLs.
    for tag in soup.find_all("li"):
        #Get text
        for a in tag.find_all("a"):
            for name in a.text.split('\n'):
                if len(name) > 0:
                    menu.append(name.strip())
        #Get links
        menu_links = tag.find_all("a")
        lks = []
        for link in menu_links:
            lks.append(link.get("href"))
        #print("Links: ", len(lks)) #turn this on to troubleshoot
        if lks != []:
            menlk.append(lks[0])
        else:
            menlk.append("")
    #Return the appropriate information to the user, based on the string "types"
    if types == "link":
        return menlk
    if types == "text":
        return menu
    if types == "dict":
        dictionary = dict(zip(menu, menlk))
        return dictionary
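To show the three return types side by side, here is a short sketch that reuses the hypothetical soup from the example above (menu contents will vary by site):
#Example usage: pull the same menu in each of the three formats
menu_links = scrape_menu(example_soup, "link") #list of hrefs
menu_names = scrape_menu(example_soup, "text") #list of menu labels
menu_dict = scrape_menu(example_soup, "dict")  #{label: href}
print(menu_dict)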
def skinny_scrape(soup):
    """This scrapes all tagged 'paragraphs' from the website. May miss some information.
    Provide soup (which can be generated by scrape_prep)."""
    parags = []
    for tag in soup.find_all("p"):
        for name in tag.text.split('\n'):
            if len(name) > 0:
                parags.append(name.strip())
    return parags
def scrape_links(soup):
    """Give this function the soup and it will return all links from the site as a list."""
    links = []
    for lk in soup.find_all("a"):
        link = lk.get("href")
        links.append(link)
    return links
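For completeness, a small sketch of these two simpler scrapers, again using the hypothetical soup from the earlier example:
#Example usage: paragraph text and every link on the page
paragraphs = skinny_scrape(example_soup) #list of non-empty paragraph lines
all_links = scrape_links(example_soup)   #list of href values (may include None for anchors without an href)
print(len(paragraphs), "paragraphs;", len(all_links), "links")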
def scrape_select(dic, term, types, home):
    """This function pulls from the menu, opening the link associated with a term and returning the requested
    contents (a tab name, paragraph text, or the matched page's own menu).
    It accepts four arguments:
    dic=dictionary of tab names and links (note that scrape_menu can provide this)
    term=the term you want to find in the menu
    types=the type of return you want, which accepts:
        'tabname' (the name of the tab that contained the search term)
        'parags' (the plain text from the matched page)
        'link', 'text', or 'dict' (passed through to scrape_menu, returning the matched page's menu)
    home=the home URL for the site (as a string)
    Returns an empty string if no menu entry matches the term."""
    for each in dic:
        match = re.search(term, each)
        if match is not None:
            if types == "tabname":
                tabname = each
                return tabname
            else:
                soup = scrape_prep(dic[each], home)
                if soup == "":
                    #print("No matches") #turn this on to troubleshoot
                    return ""
                if types == "parags":
                    para = skinny_scrape(soup)
                    return para
                else:
                    lks = scrape_menu(soup, types)
                    return lks
    #No menu entry matched the search term
    return ""
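Here is a sketch of how scrape_select might be called against a menu dictionary; the search term and URL are placeholders carried over from the earlier examples:
#Example usage: find the tab whose name mentions 'Services', then pull that page's paragraph text
services_tab = scrape_select(menu_dict, "Services", "tabname", "www.example-hospital.com")
if services_tab != "":
    services_text = scrape_select(menu_dict, "Services", "parags", "www.example-hospital.com")
    print(services_tab, "->", len(services_text), "paragraphs")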
def scrape_find(dic: dict, term: str, types: str):
    """This function identifies all matching cases in the provided menu, and returns a list.
    Arguments include:
    -dic (your dictionary)
    -term (what you want to search for)
    -types (what type of output you want: either 'text' or 'link')"""
    matches = []
    for each in dic:
        match = re.search(term, each)
        if match is not None:
            if types == "text":
                matches.append(each)
            if types == "link":
                matches.append(dic[each])
    return matches
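scrape_find is handy when a term can appear under more than one tab; a brief sketch using the same placeholder dictionary:
#Example usage: list every menu entry (and its link) that mentions 'Care'
care_tabs = scrape_find(menu_dict, "Care", "text")
care_links = scrape_find(menu_dict, "Care", "link")
print(care_tabs)
print(care_links)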
Once everything is ready to go, we need to actually read in the dataframe. My dataframe has a variety of hospital characteristics, but for the purposes of this scrape I only use the "url" column. I read it in and then initialize the new columns that I will populate when I scrape each site.
#Import Dataset
##CSV file with URLs and other hospital data, read in as a pandas dataframe
georgia = pd.read_csv("appended2.csv")
#Pull out just the URLs as a pandas Series (in my dataset, the URLs are under a column named "url")
urls = georgia.url
#Create new columns to populate (used in the next section)
georgia['soup']=np.nan
georgia['menu']= np.nan
georgia['hometext']=np.nan
georgia['bartext']=np.nan
georgia['bartab']=np.nan
georgia['davinci']=np.nan
georgia['datab']=np.nan
georgia['misstab']=np.nan
georgia['misstext']=np.nan
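Before launching the scrape, it can help to confirm that the file loaded and that the URL column looks sensible; a quick check along these lines (the column names match my dataset and may differ in yours):
#Quick sanity check on the imported data
print(georgia.shape)   #rows and columns
print(urls.head())     #first few URLs
print(urls.isna().sum(), "rows are missing a URL")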
Finally, I use a for loop to scrape every site in the list.
#Scrape Content from all URLs
##This iterates through any list of URLs--I compiled my list through www.ahd.com using a free education account.
###If you want to publish, AHD may provide you free access to their data, even without the EDU account.
i = 0 #initialize row counter
for url in urls:
    #print(url) #Turn on to troubleshoot
    if pd.notna(url):
        #Prep the url for scraping
        urlsoup = scrape_prep(url, "") ##will return a blank string and print "Error!" if the URL is broken
        if urlsoup != "":
            #Get the menu
            menu = scrape_menu(urlsoup, "dict")
            mentxt = [*menu] #changes the menu dictionary into a list of keys
            georgia.loc[i, 'menu'] = "; ".join(mentxt)
            #Get the text from the home page
            text = skinny_scrape(urlsoup)
            georgia.loc[i, 'hometext'] = "; ".join(text)
            #Get the info for bariatric surgery (if it exists)
            barterms = ["Weight Loss", "Weight-Loss", "bariatric", "weightloss", "weight loss", "weight-loss"]
            #Get tab names
            bartab = ""
            for term in barterms:
                if bartab == "":
                    bartab = scrape_select(menu, term, "tabname", url)
                    if bartab != "":
                        bartext = scrape_select(menu, term, "parags", url)
            if bartab != "":
                georgia.loc[i, 'bartab'] = bartab
                #Get text information
                georgia.loc[i, 'bartext'] = "; ".join(bartext)
            #Get the Da Vinci info
            daterms = ['Da Vinci', "Robotic Surgery", "Robotic"]
            datab = ""
            for term in daterms:
                if datab == "":
                    datab = scrape_select(menu, term, "tabname", url)
                    if datab != "":
                        datext = scrape_select(menu, term, "parags", url)
            if datab != "":
                georgia.loc[i, 'datab'] = datab
                georgia.loc[i, 'davinci'] = "; ".join(datext)
            #Get the mission statements
            missterms = ['Mission', 'Purpose']
            termtab = ""
            for term in missterms:
                if termtab == "":
                    termtab = scrape_select(menu, term, "tabname", url)
                    if termtab != "":
                        misstext = scrape_select(menu, term, "parags", url)
            if termtab != "":
                georgia.loc[i, 'misstab'] = termtab
                georgia.loc[i, 'misstext'] = "; ".join(misstext)
    i = i + 1
Once the scraping is done, I check it and then send it to a CSV for analysis in R!
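A couple of quick checks along these lines (sketched with the same column names as above) make it easy to spot empty or broken scrapes before exporting:
#Spot-check the scraped columns before writing out
print(georgia[['url', 'bartab', 'datab', 'misstab']].head())
print(georgia.misstext.notna().sum(), "hospitals had a mission/purpose page scraped")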
#Export newly created dataset to CSV
georgia.to_csv("georgiatxt2.csv")