I followed a web-scraping tutorial on YouTube by Keith Galli and did my best to solve each exercise before watching his solution:

https://www.youtube.com/watch?v=GjKQ6V_ViQE

Load the webpage

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# define the link
my_url = 'https://keithgalli.github.io/web-scraping/webpage.html'
r = requests.get(my_url)

# parse the HTML into a BeautifulSoup object so we can query it
webpage = bs(r.content, 'html.parser')
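
As a small addition of my own (not part of the tutorial), it is worth checking that the request actually succeeded before parsing; raise_for_status() stops the script on a 4xx/5xx response:

# my own addition: stop early if the page could not be downloaded
r.raise_for_status()
print(r.status_code)   # 200 when everything is fine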
 

Grab all social links

My attempt

socials = webpage.find_all("ul",{"class":"socials"})

for social in socials:
    links = social.find_all('a')
    for link in links:
        print(link.get('href'))
https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli

Suggested solutions

links2 = webpage.select("ul.socials a")
actual_links = [ link['href'] for link in links2]
actual_links
['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

ulist = webpage.find("ul", {"class":"socials"})
links = ulist.find_all("a")
actual_links = [ link['href'] for link in links]
actual_links
['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

links = webpage.select("li.social a")
links
actual_links = [ link['href'] for link in links]
actual_links
['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']
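
As a small variation of my own on the same selector, the visible link text and the href can be collected into a dict, which makes it easy to look a platform up by name (the keys are whatever text each link happens to carry):

# my own variation: map each link's visible text to its URL
social_links = {a.get_text(strip=True): a['href'] for a in webpage.select("ul.socials a")}
print(social_links)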

Scrape the table into a DataFrame

My attempt

table = webpage.select('table.hockey-stats')
table_rows = table[0].find_all('tr')
print(table_rows)
[<tr>
<th class="season" data-sort="">S</th>
<th class="team" data-sort="team">Team</th>
<th class="league" data-sort="league">League</th>
<th class="regular gp" data-sort="gp">GP</th>
<th class="regular g" data-sort="g">G</th>
<th class="regular a" data-sort="a">A</th>
<th class="regular tp" data-sort="tp">TP</th>
<th class="regular pim" data-sort="pim">PIM</th>
<th class="regular pm" data-sort="pm">+/-</th>
<th class="separator"> </th>
<th class="postseason">POST</th>
<th class="postseason gp" data-sort="playoffs-gp">GP</th>
<th class="postseason g" data-sort="playoffs-g">G</th>
<th class="postseason a" data-sort="playoffs-a">A</th>
<th class="postseason tp" data-sort="playoffs-tp">TP</th>
<th class="postseason pim" data-sort="playoffs-pim">PIM</th>
<th class="postseason pm" data-sort="playoffs-pm">+/-</th>
</tr>, <tr class="team-continent-NA">
<td class="season sorted">
                  2014-15
              </td>
<td class="team">
<i><img src="images/flag.png"/></i>
<span class="txt-blue">
<a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats"> MIT (Mass. Inst. of Tech.) </a>
</span>
</td>
<td class="league"> <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> ACHA II </a> </td>
<td class="regular gp">17</td>
<td class="regular g">3</td>
<td class="regular a">9</td>
<td class="regular tp">12</td>
<td class="regular pim">20</td>
<td class="regular pm"></td>
<td class="separator"> | </td>
<td class="postseason">
<a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> </a>
</td>
<td class="postseason gp">
</td>
<td class="postseason g">
</td>
<td class="postseason a">
</td>
<td class="postseason tp">
</td>
<td class="postseason pim">
</td>
<td class="postseason pm">
</td>
</tr>, <tr class="team-continent-NA">
<td class="season sorted">
                  2015-16
              </td>
<td class="team">
<i><img src="images/flag.png"/></i>
<span class="txt-blue">
<a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2015-2016?tab=stats"> MIT (Mass. Inst. of Tech.) </a>
</span>
</td>
<td class="league"> <a href="https://www.eliteprospects.com/league/acha-ii/stats/2015-2016"> ACHA II </a> </td>
<td class="regular gp">9</td>
<td class="regular g">1</td>
<td class="regular a">1</td>
<td class="regular tp">2</td>
<td class="regular pim">2</td>
<td class="regular pm"></td>
<td class="separator"> | </td>
<td class="postseason">
<a href="https://www.eliteprospects.com/league/acha-ii/stats/2015-2016"> </a>
</td>
<td class="postseason gp">
</td>
<td class="postseason g">
</td>
<td class="postseason a">
</td>
<td class="postseason tp">
</td>
<td class="postseason pim">
</td>
<td class="postseason pm">
</td>
</tr>, <tr class="team-continent-NA">
<td class="season sorted">
                  2016-17
              </td>
<td class="team">
<i><img src="images/flag.png"/></i>
<span class="txt-blue">
<a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2016-2017?tab=stats"> MIT (Mass. Inst. of Tech.) </a>
</span>
</td>
<td class="league"> <a href="https://www.eliteprospects.com/league/acha-ii/stats/2016-2017"> ACHA II </a> </td>
<td class="regular gp">12</td>
<td class="regular g">5</td>
<td class="regular a">5</td>
<td class="regular tp">10</td>
<td class="regular pim">8</td>
<td class="regular pm">0</td>
<td class="separator"> | </td>
<td class="postseason">
</td>
<td class="postseason gp">
</td>
<td class="postseason g">
</td>
<td class="postseason a">
</td>
<td class="postseason tp">
</td>
<td class="postseason pim">
</td>
<td class="postseason pm">
</td>
</tr>, <tr class="team-continent-EU">
<td class="season sorted">
                  2017-18
              </td>
<td class="team">
                  Did not play
              </td>
<td class="league"> <a href="https://www.eliteprospects.com/stats"> </a> </td>
<td class="regular gp"></td>
<td class="regular g"></td>
<td class="regular a"></td>
<td class="regular tp"></td>
<td class="regular pim"></td>
<td class="regular pm"></td>
<td class="separator"> | </td>
<td class="postseason">
<a href="https://www.eliteprospects.com/stats"> </a>
</td>
<td class="postseason gp">
</td>
<td class="postseason g">
</td>
<td class="postseason a">
</td>
<td class="postseason tp">
</td>
<td class="postseason pim">
</td>
<td class="postseason pm">
</td>
</tr>, <tr class="team-continent-NA">
<td class="season sorted">
                  2018-19
              </td>
<td class="team">
<i><img src="images/flag.png"/></i>
<span class="txt-blue">
<a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2018-2019?tab=stats"> MIT (Mass. Inst. of Tech.) </a>
</span>
</td>
<td class="league"> <a href="https://www.eliteprospects.com/league/acha-iii/stats/2018-2019"> ACHA III </a> </td>
<td class="regular gp">8</td>
<td class="regular g">5</td>
<td class="regular a">10</td>
<td class="regular tp">15</td>
<td class="regular pim">8</td>
<td class="regular pm"></td>
<td class="separator"> | </td>
<td class="postseason">
<a href="https://www.eliteprospects.com/league/acha-iii/stats/2018-2019"> </a>
</td>
<td class="postseason gp">
</td>
<td class="postseason g">
</td>
<td class="postseason a">
</td>
<td class="postseason tp">
</td>
<td class="postseason pim">
</td>
<td class="postseason pm">
</td>
</tr>]

l = []
for tr in table_rows:
    td = tr.find_all('td')             # the header row has <th> cells only, so td is empty there
    row = [cell.text for cell in td]   # avoid re-using the name tr inside the comprehension
    l.append(row)
df = pd.DataFrame(l)
df
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
0 None None None None None None None None None None None None None None None None None
1 \n 2014-15\n \n\n\n MIT (Mass. Inst. of Tech.) \n\n ACHA II 17 3 9 12 20 | \n \n \n \n \n \n \n \n
2 \n 2015-16\n \n\n\n MIT (Mass. Inst. of Tech.) \n\n ACHA II 9 1 1 2 2 | \n \n \n \n \n \n \n \n
3 \n 2016-17\n \n\n\n MIT (Mass. Inst. of Tech.) \n\n ACHA II 12 5 5 10 8 0 | \n \n \n \n \n \n \n
4 \n 2017-18\n \n Did not play\n | \n \n \n \n \n \n \n \n
5 \n 2018-19\n \n\n\n MIT (Mass. Inst. of Tech.) \n\n ACHA III 8 5 10 15 8 | \n \n \n \n \n \n \n \n
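
The all-None first row appears because the header <tr> only contains <th> cells, so find_all('td') returns an empty list for it, and the stray \n come from the whitespace inside the cells. Keeping my attempt otherwise as is, a quick fix is to skip rows without <td> cells and strip each cell:

# cleanup of my attempt: drop the header row and strip the whitespace in each cell
l = [[cell.get_text(strip=True) for cell in tr.find_all('td')]
     for tr in table_rows if tr.find_all('td')]
df = pd.DataFrame(l)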

Suggested solutions

table = webpage.select('table.hockey-stats')[0]
columns = table.find("thead").find_all("th")
column_names = [c.string for c in columns]

table_rows = table.find("tbody").find_all("tr")
l = []

for tr in table_rows:
    td = tr.find_all('td')
    row = [str(tr.get_text()).strip() for tr in td]
    l.append(row)

df = pd.DataFrame(l, columns = column_names)
df
         S                        Team    League  GP   G   A  TP  PIM  +/-     POST  GP   G   A  TP  PIM  +/-
0  2014-15  MIT (Mass. Inst. of Tech.)   ACHA II  17   3   9  12   20       |
1  2015-16  MIT (Mass. Inst. of Tech.)   ACHA II   9   1   1   2    2       |
2  2016-17  MIT (Mass. Inst. of Tech.)   ACHA II  12   5   5  10    8    0  |
3  2017-18                Did not play                                      |
4  2018-19  MIT (Mass. Inst. of Tech.)  ACHA III   8   5  10  15    8       |
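
Another route I only found later (not shown in the video) is to let pandas parse the table HTML directly; this needs lxml or html5lib installed, and the column names it infers may differ slightly from the ones built above:

from io import StringIO

# feed the raw <table> markup straight to pandas
df2 = pd.read_html(StringIO(str(table)))[0]
df2.head()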

Grab all fun facts containing the word "is"

My attempt

fun_facts = webpage.find_all("ul",{"class":"fun-facts"})[0]
fun_facts
<ul class="fun-facts">
<li>Owned my dream car in high school <a href="#footer"><sup>1</sup></a></li>
<li>Middle name is Ronald</li>
<li>Never had been on a plane until college</li>
<li>Dunkin Donuts coffee is better than Starbucks</li>
<li>A favorite book series of mine is <i>Ender's Game</i></li>
<li>Current video game of choice is <i>Rocket League</i></li>
<li>The band that I've seen the most times live is the <i>Zac Brown Band</i></li>
</ul>
import re

# note: find_all() with no arguments returns every descendant tag of the <ul>;
# find_all("li") would be more precise, but the substring check works here
for fact in fun_facts.find_all():
    check = fact.text
    if "is" in check:
        print(fact.get_text())
Middle name is Ronald
Dunkin Donuts coffee is better than Starbucks
A favorite book series of mine is Ender's Game
Current video game of choice is Rocket League
The band that I've seen the most times live is the Zac Brown Band

Solutions

facts = webpage.select("ul.fun-facts li")
facts_with_is = [facts.find(string=re.compile("is")) for facts in facts]
facts_with_is = [fact.find_parent().get_text() for fact in facts_with_is if fact]
facts_with_is
['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

Download an image from the webpage

My attempt

imgs = webpage.select("div.row div.column img")
one_pic = imgs[0]['src']

image_url = "https://keithgalli.github.io/web-scraping/"+one_pic
image_url
'https://keithgalli.github.io/web-scraping/images/italy/lake_como.jpg'

from PIL import Image

# stream the response and let Pillow decode the raw bytes, then save to disk
img = Image.open(requests.get(image_url, stream=True).raw)
img.save('image.jpg')

Solution

url = "https://keithgalli.github.io/web-scraping/"
images = webpage.select("div.row div.column img")
image_url = images[0]['src']
full_url = url+image_url
print(full_url)
https://keithgalli.github.io/web-scraping/images/italy/lake_como.jpg

img_data = requests.get(full_url).content
with open("lake_como.jpg",'wb') as handler:
    handler.write(img_data)
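
To grab every photo instead of just the first one (my own extension; the "photos" folder name is made up), the same pattern can go in a loop, with urljoin resolving the relative src paths against the base URL:

import os
from urllib.parse import urljoin

os.makedirs("photos", exist_ok=True)
for img_tag in webpage.select("div.row div.column img"):
    full = urljoin(url, img_tag['src'])          # e.g. images/italy/lake_como.jpg -> full URL
    filename = os.path.join("photos", os.path.basename(img_tag['src']))
    with open(filename, 'wb') as handler:
        handler.write(requests.get(full).content)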

Final exercise

My attempt

secrets = webpage.select("div.block a")
for secret in secrets:
    link = secret['href']
    my_url_new = url + link                      # 'url' is the base URL defined in the image solution above

    r_new = requests.get(my_url_new)
    webpage_new = bs(r_new.content, 'html.parser')

    msg = webpage_new.find("p", {"id":"secret-word"}).get_text()
    print(msg)
    
Make
sure
to
smash
that
like
button
and
subscribe
!!!

Solution

files = webpage.select("div.block a")
relative_files = [f['href'] for f in files]
relative_files

for f in relative_files:
    full_url = url+f
    
    page = requests.get(full_url)
    bs_page = bs(page.content)
    
    secret_word_element = bs_page.find("p", {"id":"secret-word"})
    secret_word = secret_word_element.string
    print(secret_word)
    
Make
sure
to
smash
that
like
button
and
subscribe
!!!
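
Since the secret words form a sentence, a small follow-up of mine is to collect them in a list and join them back together:

words = []
for f in relative_files:
    page = requests.get(url + f)
    bs_page = bs(page.content, 'html.parser')
    words.append(bs_page.find("p", {"id": "secret-word"}).string)

print(" ".join(words))
# -> Make sure to smash that like button and subscribe !!!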