BeautifulSoup tutorial - basic web scraping
My first attempt at web scraping, following a YouTube tutorial by Keith Galli
- Load web
- Grab all social links
- Scrape table into dataframe
- Grab all fun facts containing the word "is"
- Download image from webpage
- Final exercise
I followed a tutorial from YouTube. I did my best to attempt each exercise myself before watching Keith Galli's solution:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
# Target page for all of the scraping exercises below
my_url = 'https://keithgalli.github.io/web-scraping/webpage.html'
# timeout keeps a dead server from hanging the script; raise_for_status
# surfaces HTTP errors (404/500) instead of silently parsing an error page
r = requests.get(my_url, timeout=10)
r.raise_for_status()
# Name the parser explicitly: bs(r.content) alone emits a
# GuessedAtParserWarning and can pick different parsers on different machines
webpage = bs(r.content, "html.parser")
# First pass: locate every <ul class="socials"> and print each anchor's href
socials = webpage.find_all("ul", attrs={"class": "socials"})
for social in socials:
    links = social.find_all('a')
    for link in links:
        print(link.get('href'))

# Second pass: one CSS selector reaches the same anchors in a single call
links2 = webpage.select("ul.socials a")
actual_links = [anchor['href'] for anchor in links2]
actual_links
# Variant: take only the first socials <ul>, then harvest each anchor's href
ulist = webpage.find("ul", class_="socials")
links = ulist.find_all("a")
actual_links = [a['href'] for a in links]
actual_links
# Variant: select anchors through the per-item "social" class on each <li>
links = webpage.select("li.social a")
links
actual_links = [tag['href'] for tag in links]
actual_links
# Scrape the hockey-stats table into a DataFrame (first pass, no headers yet)
table = webpage.select('table.hockey-stats')
table_rows = table[0].find_all('tr')
print(table_rows)
rows = []
for tr in table_rows:
    cells = tr.find_all('td')
    # The header row holds <th>, not <td>, so find_all('td') returns [].
    # Skip it rather than append an empty list, which would surface as an
    # all-NaN row in the DataFrame.
    if not cells:
        continue
    # use a distinct name for the cell iterator — the original reused `tr`,
    # shadowing the outer loop variable
    rows.append([cell.text for cell in cells])
df = pd.DataFrame(rows)
df
# Second pass: read column names from <thead> and data rows from <tbody>
table = webpage.select('table.hockey-stats')[0]
columns = table.find("thead").find_all("th")
column_names = [c.string for c in columns]
table_rows = table.find("tbody").find_all("tr")
data = []
for tr in table_rows:
    cells = tr.find_all('td')
    # distinct iterator name: the original's `for tr in td` shadowed the
    # outer loop variable, which made the comprehension easy to misread
    data.append([str(cell.get_text()).strip() for cell in cells])
df = pd.DataFrame(data, columns=column_names)
df
# Exercise: print the fun facts that contain the word "is"
fun_facts = webpage.find_all("ul", {"class": "fun-facts"})[0]
fun_facts
import re
# find_all() with no arguments yields every tag in the subtree (the <li>s here)
for fact in fun_facts.find_all():
    # \b word boundaries match the standalone word "is" only; the original
    # substring test ("is" in text) also matched words like "this"/"island"
    if re.search(r"\bis\b", fact.text):
        print(fact.get_text())
# Same exercise via CSS selection plus a regex text search
facts = webpage.select("ul.fun-facts li")
# NOTE: re.compile("is") is a substring search (Keith Galli's original
# approach); kept as-is here to mirror the tutorial's solution.
# Renamed the iterator — the original wrote `for facts in facts`,
# shadowing the list being iterated.
matches = [item.find(string=re.compile("is")) for item in facts]
facts_with_is = [m.find_parent().get_text() for m in matches if m]
facts_with_is
# Download the first gallery image and save it via PIL
imgs = webpage.select("div.row div.column img")
one_pic = imgs[0]['src']
image_url = "https://keithgalli.github.io/web-scraping/" + one_pic
image_url
from PIL import Image
from io import BytesIO
# Feed PIL the fully-downloaded, content-decoded bytes. The original passed
# requests' response.raw stream, which is NOT content-decoded — a gzipped
# response would hand PIL corrupt data.
img = Image.open(BytesIO(requests.get(image_url, timeout=10).content))
img.save('image.jpg')
# Alternative: write the image bytes straight to disk — no PIL round-trip
url = "https://keithgalli.github.io/web-scraping/"
images = webpage.select("div.row div.column img")
image_url = images[0]['src']
full_url = url + image_url
print(full_url)
with open("lake_como.jpg", 'wb') as out_file:
    out_file.write(requests.get(full_url).content)
# Final exercise: follow each challenge-file link and print its secret word
secrets = webpage.select("div.block a")
for secret in secrets:
    link = secret['href']
    my_url_new = url + link
    r_new = requests.get(my_url_new, timeout=10)
    # explicit parser avoids bs4's GuessedAtParserWarning on every iteration
    webpage_new = bs(r_new.content, "html.parser")
    msg = webpage_new.find("p", {"id": "secret-word"}).get_text()
    print(msg)
# Same exercise, split in two steps: collect the relative links first
files = webpage.select("div.block a")
relative_files = [f['href'] for f in files]
relative_files
for f in relative_files:
    full_url = url + f
    page = requests.get(full_url, timeout=10)
    # explicit parser: consistent, warning-free parsing (see setup above)
    bs_page = bs(page.content, "html.parser")
    secret_word_element = bs_page.find("p", {"id": "secret-word"})
    secret_word = secret_word_element.string
    print(secret_word)