Web Scraping With Python



Playing with BeautifulSoup

The starting block, which gives us a function we can throw urls at and have it return a BeautifulSoup object (one of my favourite snippets).

from bs4 import BeautifulSoup
from requests import get
from fake_useragent import UserAgent

# A randomised User-Agent makes the request look like a real browser.
ua = UserAgent()

def lovely_soup(u):
    """Fetch url *u* (spoofing a Chrome User-Agent) and return it parsed as a BeautifulSoup object."""
    r = get(u, headers={'User-Agent': ua.chrome})
    return BeautifulSoup(r.text, 'lxml')

# NOTE: removed the stray `soup = lovely_soup()` call that was here — the
# function requires a url argument, so calling it bare raises TypeError.

Let's throw a url at it and see what it does...

# Fetch the homepage and print its raw HTML source.
soup = lovely_soup('https://recycledrobot.co.uk')
print(soup)

You'll see a load of HTML fly past you. That's the source of the website we want. Now we'll use BeautifulSoup to extract an element's text. The subtitle for instance...

# Find the first <span class="subtitle"> element and print its text content.
soup = lovely_soup('https://recycledrobot.co.uk')
thing = soup.find('span', {'class': 'subtitle'})
print(thing.text)

That's pretty much the basic setup covered. Let's move onto other things. Kittens!

# List every thumbnail <img> on the kitten tag page and print its source url.
soup = lovely_soup('https://wallpaperscraft.com/tag/kitten')

for image in soup.findAll('img', {'class': 'wallpapers__image'}):
    print(image['src'])

Woo. We now have links to kitten images, but they're tiny. We'll have to follow the trail and find the original.

# Follow each thumbnail link through the detail page and the resolution
# page to reach the url of the full-size original image.
soup = lovely_soup('https://wallpaperscraft.com/tag/kitten')
links = soup.findAll('a', {'class': 'wallpapers__link'})

for link in links:
    detail = lovely_soup('https://wallpaperscraft.com{}'.format(link['href']))
    # The second table cell holds the link to the resolution page.
    cell = detail.findAll('span', {'class': 'wallpaper-table__cell'})[1]
    resolution = lovely_soup('https://wallpaperscraft.com{}'.format(cell.find('a')['href']))
    # The download button links straight to the original file.
    print(resolution.find('a', {'class': 'gui-button_full-height'})['href'])

We've jumped through hoops and found the original file. Now let's put all that in a function of its own and go download all these kitten pictures to a directory called "images".

def get_img(get_url):
    """Download get_url into the images/ directory if it ends in an image extension."""
    # Guard clause: silently ignore anything that isn't an image file.
    if not get_url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        return
    img_name = 'images/{}'.format(get_url.split('/')[-1])
    with open(img_name, 'wb') as f:
        f.write(get(get_url).content)

def get_kittens():
    # Scrape the kitten tag page, follow each wallpaper through two more
    # pages to its full-size image url, and hand that url to get_img().
    soup = lovely_soup('https://wallpaperscraft.com/tag/kitten')
    links = soup.findAll('a', {'class': 'wallpapers__link'})

    for link in links:
        # Wallpaper detail page for this thumbnail.
        url = 'https://wallpaperscraft.com{}'.format(link['href'])
        soup = lovely_soup(url)
        # The second table cell holds the link to the resolution page.
        url = soup.findAll('span', {'class': 'wallpaper-table__cell'})[1].find('a')['href']
        url = 'https://wallpaperscraft.com{}'.format(url)
        soup = lovely_soup(url)
        # The download button links straight to the original image file.
        url = soup.find('a', {'class': 'gui-button_full-height'})['href']
        print(url)
        get_img(url)

get_kittens()

There are 15 pictures of kittens. Let's go get some text from somewhere. Bad news headlines!

def get_headlines():
    """Scrape the clickbait headline list and print each one, cleaned of numbering."""
    soup = lovely_soup('https://www.copypress.com/blog/40-headlines-the-good-the-bad-and-the-ugly/')
    container = soup.find('div', {'class': 'blog-single-inner-cont'})

    # The first <strong> isn't a headline, so skip it.
    for tag in container.findAll('strong')[1:]:
        # Drop the leading "12. " style numbering and surrounding whitespace.
        print(tag.text.lstrip('0123456789.- ').strip())

get_headlines()

Ok, now what? We save these to a json file of course...

import json

def get_headlines():
    # Scrape the headline list, clean each line, and persist them to
    # data.json as {headline: 0} placeholders.
    soup = lovely_soup('https://www.copypress.com/blog/40-headlines-the-good-the-bad-and-the-ugly/')
    headlines = soup.find('div', {'class': 'blog-single-inner-cont'})
    headlines = headlines.findAll('strong')
    data = {}

    for headline in headlines[1:]:
        line = headline.text
        # Drop the leading "12. " style numbering.
        line = line.lstrip('0123456789.- ').strip()
        print(line)
        data.update({line: 0})

    with open('data.json', 'w+') as f:
        json.dump(data, f)

get_headlines()

Hang on... That json file is full of zero values. Best use them wisely. Let's revisit our functions and push some data around.

def get_img(get_url):
    """Download get_url into images/ and return the saved path (None for non-images)."""
    # Guard clause: anything without an image extension is skipped.
    if not get_url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        return None
    img_name = 'images/{}'.format(get_url.split('/')[-1])
    with open(img_name, 'wb') as f:
        f.write(get(get_url).content)
    return img_name # RETURNING KITTEN LOCATION

def get_kittens():
    # Scrape every kitten wallpaper on the first tag page, download each
    # original image via get_img(), and return the list of saved paths.
    soup = lovely_soup('https://wallpaperscraft.com/tag/kitten')
    links = soup.findAll('a', {'class': 'wallpapers__link'})
    kittens = [] # CREATE KITTEN LIST

    for link in links:
        url = 'https://wallpaperscraft.com{}'.format(link['href'])
        soup = lovely_soup(url)
        # The second table cell links to the resolution page.
        url = soup.findAll('span', {'class': 'wallpaper-table__cell'})[1].find('a')['href']
        url = 'https://wallpaperscraft.com{}'.format(url)
        soup = lovely_soup(url)
        # The download button href is the original image file.
        url = soup.find('a', {'class': 'gui-button_full-height'})['href']
        kittens.append(get_img(url)) # COLLECTING KITTENS
    return kittens # RETURNING ALL KITTEN LOCATIONS

def get_headlines(kittens):
    # Pair each scraped headline with a downloaded kitten image path and
    # write the mapping to data.json as {headline: image_path}.
    soup = lovely_soup('https://www.copypress.com/blog/40-headlines-the-good-the-bad-and-the-ugly/')
    headlines = soup.find('div', {'class': 'blog-single-inner-cont'})
    headlines = headlines.findAll('strong')
    data = {}
    for headline, kitten in zip(headlines[1:], kittens): # LOOP THROUGH KITTENS AND HEADLINES
        line = headline.text
        # Drop the leading "12. " style numbering.
        line = line.lstrip('0123456789.- ').strip()
        print(line)
        data.update({line: kitten})

    with open('data.json', 'w+') as f:
        json.dump(data, f)

kittens = get_kittens() # GET KITTENS
get_headlines(kittens) # PLAY WITH KITTENS

And... Voila! A script which downloads 15 images of kittens along with bad headlines and dumps the data into a json file as headline : image for each entry. Just what we all need. Lovely!

from bs4 import BeautifulSoup
from requests import get
from fake_useragent import UserAgent
import json

ua = UserAgent()

def lovely_soup(u):
    """Fetch url *u* with a spoofed Chrome User-Agent and return a BeautifulSoup object."""
    r = get(u, headers={'User-Agent': ua.chrome})
    return BeautifulSoup(r.text, 'lxml')

def get_img(get_url):
    """Download get_url into the images/ directory.

    Returns the saved file path, or None when the url does not end in a
    recognised image extension.
    """
    if not get_url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        return None
    # Make sure the target directory exists so open() can't fail on a
    # fresh checkout.
    import os
    os.makedirs('images', exist_ok=True)
    img_name = 'images/{}'.format(get_url.split('/')[-1])
    img = get(get_url).content
    with open(img_name, 'wb') as f:
        f.write(img)
    return img_name

def get_kittens():
    """Scrape the first kitten tag page and download every original image.

    Returns a list of local file paths for the images that were actually
    saved.
    """
    soup = lovely_soup('https://wallpaperscraft.com/tag/kitten')
    links = soup.findAll('a', {'class': 'wallpapers__link'})
    kittens = []

    for link in links:
        # Wallpaper detail page for this thumbnail.
        url = 'https://wallpaperscraft.com{}'.format(link['href'])
        soup = lovely_soup(url)
        # The second table cell links to the resolution page.
        url = soup.findAll('span', {'class': 'wallpaper-table__cell'})[1].find('a')['href']
        url = 'https://wallpaperscraft.com{}'.format(url)
        soup = lovely_soup(url)
        # The download button href is the original image file.
        url = soup.find('a', {'class': 'gui-button_full-height'})['href']
        saved = get_img(url)
        # get_img() returns None for non-image urls; keep the list clean
        # so data.json never ends up with null values.
        if saved:
            kittens.append(saved)
    return kittens

def get_headlines(kittens):
    """Map each scraped headline to a kitten image path and save the result to data.json."""
    soup = lovely_soup('https://www.copypress.com/blog/40-headlines-the-good-the-bad-and-the-ugly/')
    strongs = soup.find('div', {'class': 'blog-single-inner-cont'}).findAll('strong')

    # Skip the first <strong> (not a headline); zip stops at the shorter
    # of the two sequences.
    data = {
        tag.text.lstrip('0123456789.- ').strip(): kitten
        for tag, kitten in zip(strongs[1:], kittens)
    }

    with open('data.json', 'w+') as f:
        json.dump(data, f)

kittens = get_kittens() # download the images, collecting their local paths
get_headlines(kittens) # pair the paths with headlines and write data.json

This is the contents of the JSON file.

{"Eminem Terrified As Daughter Begins Dating Man Raised On His Music": "images/kitten_face_window_fluffy_92898_2046x1333.jpg", "Ways to Make Money While Waiting for Disability Benefits": "images/kitten_ball_thread_white_background_95135_3000x2399.jpg", "How to Have a Healthier and More Productive Home Office": "images/kitten_lying_striped_small_cute_102741_3872x2592.jpg", "A Little Mistake That Cost a Farmer $3,000 a Year": "images/kitten_sleeping_baby_striped_89331_1600x1200.jpg", "Are You Making These Embarrassing Mistakes at Work?": "images/kitten_cat_grass_123220_4272x2848.jpg", "Lose 8 Pounds in 2 Weeks": "images/kitten_fluffy_face_rose_grass_look_85837_2048x1432.jpg", "How Many of These Italian Foods Have You Tried?": "images/kitten_fluffy_look_95571_1920x1200.jpg", "What\u2019s Scarier Than the Sex Talk? Talking About Food & Weight": "images/kitten_briton_look_kid_96414_3000x2000.jpg", "More Than Half of Medical Advice on \u2018Dr. Oz\u2019 Lacks Proof or Contradicts Best Available Science": "images/kitten_cat_computer_keyboard_apple_mac_black_and_white_94218_1920x1280.jpg", "Lack Time? Here Are 4 Convenient Ways to Keep Your Dog Fit": "images/kitten_cat_playful_down_paw_95904_2560x1600.jpg", "How One Stupid Tweet Blew Up Justine Sacco\u2019s Life": "images/kitten_dark_lying_71730_1920x1200.jpg", "10 Signs That You Will NOT Make It As A Successful Photographer": "images/kitten_fluffy_grass_flowers_run_96294_2048x1365.jpg", "Sure-Fire Ways to Ruin Your Marriage": "images/kitten_look_surprise_striped_85821_1920x1538.jpg", "10 Different Types of Girlfriends \u2013 Which One Are You?": "images/kitten_protruding_tongue_photoshoot_pillow_97666_2048x1365.jpg", "More of Us May Be \u201cAlmost Alcoholics\u201d": "images/kitten_fluffy_look_kid_97036_3393x2248.jpg"}

This is how you parse it.

import json

# Load the saved mapping back in and print each headline followed by the
# path of the image it was paired with.
with open('data.json') as f:
    data = json.load(f)
    for headline, image in data.items():
        print(headline)
        print(image)

But what to do with it?

Tweet it

import json
import tweepy
from random import randint

# Twitter API credentials — fill in your own.
consumer_key = 'XXXX'
consumer_secret = 'XXXX'
access_key = 'XXXX'
access_secret = 'XXXX'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

def tweet_kitten():
    # Pick one random headline/image pair from data.json and tweet the
    # image with the headline as the status text.
    with open('data.json') as f:
        data = json.load(f)

    # Choose a random 1-based position, then count through the dict keys
    # until we reach it.
    pick = randint(1, len(data))
    c = 0

    for item in data:
        c += 1
        if c == pick:
            api.update_with_media(data.get(item), item)
            return True

tweet_kitten()

Instaspam it

from InstagramAPI import InstagramAPI
from random import randint
import json

# Instagram credentials — fill in your own.
instauser = 'XXXX'
instapass = 'XXXX'

api = InstagramAPI(instauser, instapass)
api.login()

with open('data.json') as f:
    data = json.load(f)

# Pick one random headline/image pair and upload the image with the
# headline as its caption.
pick = randint(1, len(data))
c = 0
for item in data:
    c += 1
    if c == pick:
        img = data.get(item)
        api.uploadPhoto(img, caption=item)

api.logout()

Build a website: https://recycledrobot.co.uk/kittens

<!DOCTYPE html>
<html>

<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Playing with BeautifulSoup</title>
  <!-- Bulma supplies the hero/card/column layout used below. -->
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/bulma/0.7.2/css/bulma.min.css">
  <style>
    .kittens {
      padding-top: 3em;
      padding-bottom: 3em;
    }

    .image img {
      object-fit: cover;
    }

    /* Multi-line ellipsis: real -webkit-line-clamp where supported, with
       a gradient "..." overlay as the fallback for other browsers. */
    .line-clamp {
      display: block;
      display: -webkit-box;
      -webkit-box-orient: vertical;
      position: relative;
      line-height: 1.2;
      overflow: hidden;
      text-overflow: ellipsis;
      padding: 0 !important;
    }

    .line-clamp:after {
      content: '...';
      text-align: right;
      bottom: 0;
      right: 0;
      width: 25%;
      display: block;
      position: absolute;
      height: calc(1em * 1.2);
      background: linear-gradient(to right, rgba(255, 255, 255, 0), rgba(255, 255, 255, 1) 75%);
    }

    /* Hide the fallback overlay when the browser can clamp natively. */
    @supports (-webkit-line-clamp: 1) {
      .line-clamp:after {
        display: none !important;
      }
    }

    .line-clamp-2 {
      -webkit-line-clamp: 2;
      height: calc(1em * 1.2 * 2);
    }

    .footer {
      padding-top: 6em;
      padding-bottom: 6em;
    }

    .footer span {
      text-decoration: line-through;
    }
  </style>
</head>

<body>

  <section class="hero is-black is-medium">
    <div class="hero-body">
      <div class="container">
        <h1 class="title">Web Scraping With Python</h1>
        <h2 class="subtitle">Playing with BeautifulSoup</h2>
      </div>
    </div>
  </section>

  <!-- The script below injects one card per data.json entry here. -->
  <section class="section">
    <div class="container">
      <div class="columns is-multiline kittens"></div>
    </div>
  </section>

  <footer class="footer">
    <div class="content has-text-centered">
      <p>Made with <span>love</span> a keyboard</p>
    </div>
  </footer>

  <script src="https://code.jquery.com/jquery-3.3.1.min.js"></script>
  <script>
    // Fetch the scraped data ({headline: image_path}) and render a Bulma
    // card — image plus clamped title — for each entry.
    $.getJSON("data.json", function(data) {
      $.each(data, function(title, image) {
        $('.kittens').append(
          `<div class='column is-4'>
              <div class="card">
                <div class="card-image">
                  <figure class="image is-4by3">
                    <img src="${image}" alt="${title}">
                  </figure>
                </div>
                <div class="card-content">
                  <div class="media">
                    <div class="media-content">
                      <p class="title is-6 line-clamp line-clamp-2">${title}</p>
                    </div>
                  </div>
                </div>
              </div>
            </div>`
        );
      });
    });
  </script>

</body>

</html>

Stealing 743 kittens

DISCLAIMER - As always... This is an example and is more than likely against all the rules. Use at your own risk!

I was asked if I could cover downloading all the kitten images instead of just the first page.

from bs4 import BeautifulSoup
from requests import get
from fake_useragent import UserAgent

ua = UserAgent()

def lovely_soup(u):
    """Fetch url *u* with a spoofed Chrome User-Agent and return a BeautifulSoup object."""
    h = {'User-Agent': ua.chrome}
    r = get(u, headers=h)
    c = r.text
    return BeautifulSoup(c, 'lxml')

def get_img(get_url):
    """Download get_url into images/ and return the saved path (None for non-images)."""
    if get_url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        img_name = 'images/{}'.format(get_url.split('/')[-1])
        img = get(get_url).content
        with open(img_name, 'wb') as f:
            f.write(img)
        return img_name

def get_all_kittens():
    """Walk every page of the kitten tag and download each original image.

    Returns the list of local file paths appended so far (one entry per
    scraped download url).
    """
    soup = lovely_soup('https://wallpaperscraft.com/tag/kitten')
    # The pager's last-page link href contains the final page number;
    # pull the digits out and add 1 so range() includes that last page.
    total = soup.find('li', {'class': 'pager__item pager__item_last-page'}).find('a')['href']
    total = int(''.join(filter(lambda x: x.isdigit(), total))) + 1
    kittens = []

    for i in range(1, total):  # total is already an int — no re-cast needed
        soup = lovely_soup(f'https://wallpaperscraft.com/tag/kitten/page{i}')
        links = soup.findAll('a', {'class': 'wallpapers__link'})

        for link in links:
            url = 'https://wallpaperscraft.com{}'.format(link['href'])
            soup = lovely_soup(url)
            # Second table cell links to the resolution page.
            url = soup.findAll('span', {'class': 'wallpaper-table__cell'})[1].find('a')['href']
            url = 'https://wallpaperscraft.com{}'.format(url)
            soup = lovely_soup(url)
            url = soup.find('a', {'class': 'gui-button_full-height'})['href']
            kittens.append(get_img(url))
            print(len(kittens))
    return kittens  # previously built but never returned

get_all_kittens()

Thanks for reading. x

Resources