Playing with BeautifulSoup
The starting block, which gives us a function we can throw urls at and have it return a BeautifulSoup object (one of my favourite snippets).
from bs4 import BeautifulSoup
import requests
def lovely_soup(url):
    """Fetch *url* with a desktop browser User-Agent and return it parsed as a BeautifulSoup tree."""
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1'}
    response = requests.get(url, headers=headers)
    return BeautifulSoup(response.content, 'lxml')
Let's throw a url at it and see what it does...
# Fetch the homepage and print the entire parsed HTML document.
soup = lovely_soup('https://recycledrobot.co.uk')
print(soup)
You'll see a load of HTML fly past you. That's the source of the website we want. Now we'll use BeautifulSoup to extract an element's text. The subtitle, for instance...
# Find the subtitle <span> on the page and print just its text content.
soup = lovely_soup('https://recycledrobot.co.uk')
thing = soup.find('span', {'class': 'subtitle'})
print(thing.text)
That's pretty much the basic setup covered. Let's move onto other things. Kittens!
# List the thumbnail URL of every kitten wallpaper on the tag page.
soup = lovely_soup('https://wallpaperscraft.com/tag/kitten')
images = soup.findAll('img', {'class': 'wallpapers__image'})
for thumb in images:
    print(thumb['src'])
Woo. We now have links to kitten images, but they're tiny. We'll have to follow the trail and find the original.
# For each thumbnail, follow the detail page, then the resolution page,
# until we reach the URL of the original full-size image.
soup = lovely_soup('https://wallpaperscraft.com/tag/kitten')
links = soup.findAll('a', {'class': 'wallpapers__link'})
for link in links:
    detail = lovely_soup('https://wallpaperscraft.com{}'.format(link['href']))
    resolution_href = detail.findAll('span', {'class': 'wallpaper-table__cell'})[1].find('a')['href']
    download_page = lovely_soup('https://wallpaperscraft.com{}'.format(resolution_href))
    original = download_page.find('a', {'class': 'gui-button_full-height'})['href']
    print(original)
We've jumped through hoops and found the original file. Now let's put all that in a function of its own and go download all these kitten pictures to a directory called "images".
def get_img(get_url):
    """Download the image at *get_url* into the local images/ directory.

    URLs that do not end in a recognised image extension are silently skipped.
    """
    if not get_url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        return
    img_name = 'images/{}'.format(get_url.split('/')[-1])
    payload = requests.get(get_url).content
    with open(img_name, 'wb') as f:
        f.write(payload)
def get_kittens():
    """Resolve every kitten thumbnail on the tag page to its original file and download it."""
    listing = lovely_soup('https://wallpaperscraft.com/tag/kitten')
    for link in listing.findAll('a', {'class': 'wallpapers__link'}):
        detail = lovely_soup('https://wallpaperscraft.com{}'.format(link['href']))
        resolution_href = detail.findAll('span', {'class': 'wallpaper-table__cell'})[1].find('a')['href']
        download_page = lovely_soup('https://wallpaperscraft.com{}'.format(resolution_href))
        original = download_page.find('a', {'class': 'gui-button_full-height'})['href']
        print(original)
        get_img(original)
get_kittens()
There are 15 pictures of kittens. Let's go get some text from somewhere. Bad news headlines!
def get_headlines():
    """Print each headline from the CopyPress '40 headlines' article, stripped of list numbering."""
    soup = lovely_soup('https://www.copypress.com/blog/40-headlines-the-good-the-bad-and-the-ugly/')
    container = soup.find('div', {'class': 'blog-single-inner-cont'})
    # The first <strong> in the post body is not a headline, so skip it.
    for tag in container.findAll('strong')[1:]:
        print(tag.text.lstrip('0123456789.- ').strip())
get_headlines()
Ok, now what? We save these to a json file of course...
import json
def get_headlines():
    """Scrape the headlines and persist them to data.json, each mapped to a placeholder 0."""
    soup = lovely_soup('https://www.copypress.com/blog/40-headlines-the-good-the-bad-and-the-ugly/')
    container = soup.find('div', {'class': 'blog-single-inner-cont'})
    data = {}
    # Skip the first <strong>, which is not a headline.
    for tag in container.findAll('strong')[1:]:
        text = tag.text.lstrip('0123456789.- ').strip()
        print(text)
        data[text] = 0
    with open('data.json', 'w+') as f:
        json.dump(data, f)
get_headlines()
Hang on... That json file is full of zero values. Best use them wisely. Let's revisit our functions and push some data around.
def get_img(get_url):
    """Download the image at *get_url* into images/ and return its local path.

    Returns None for URLs that do not end in a recognised image extension.
    """
    if get_url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        img_name = 'images/{}'.format(get_url.split('/')[-1])
        # BUG FIX: this script imports `requests` (not `from requests import get`),
        # so the bare get() here raised NameError. Use requests.get like the rest of the file.
        img = requests.get(get_url).content
        with open(img_name, 'wb') as f:
            f.write(img)
        return img_name  # RETURNING KITTEN LOCATION
def get_kittens():
    """Download every kitten wallpaper from the first tag page and return their local paths."""
    listing = lovely_soup('https://wallpaperscraft.com/tag/kitten')
    saved = []  # CREATE KITTEN LIST
    for link in listing.findAll('a', {'class': 'wallpapers__link'}):
        detail = lovely_soup('https://wallpaperscraft.com{}'.format(link['href']))
        resolution_href = detail.findAll('span', {'class': 'wallpaper-table__cell'})[1].find('a')['href']
        download_page = lovely_soup('https://wallpaperscraft.com{}'.format(resolution_href))
        original = download_page.find('a', {'class': 'gui-button_full-height'})['href']
        saved.append(get_img(original))  # COLLECTING KITTENS
    return saved  # RETURNING ALL KITTEN LOCATIONS
def get_headlines(kittens):
    """Pair each scraped headline with a kitten image path and write the mapping to data.json."""
    soup = lovely_soup('https://www.copypress.com/blog/40-headlines-the-good-the-bad-and-the-ugly/')
    container = soup.find('div', {'class': 'blog-single-inner-cont'})
    strongs = container.findAll('strong')
    data = {}
    # Skip the first <strong> (not a headline); zip stops at the shorter list.
    for tag, kitten in zip(strongs[1:], kittens):
        text = tag.text.lstrip('0123456789.- ').strip()
        print(text)
        data[text] = kitten
    with open('data.json', 'w+') as f:
        json.dump(data, f)
# Download the images first, then pair each headline with one in data.json.
kittens = get_kittens() # GET KITTENS
get_headlines(kittens) # PLAY WITH KITTENS
And... Voila! A script which downloads 15 images of kittens along with bad headlines and dumps the data into a json file as headline : image for each entry. Just what we all need. Lovely!
from bs4 import BeautifulSoup
import requests
import json
def lovely_soup(url):
    """Return *url*'s content parsed with lxml, fetched using a browser-like User-Agent."""
    ua = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1'
    page = requests.get(url, headers={'User-Agent': ua})
    return BeautifulSoup(page.content, 'lxml')
def get_img(get_url):
    """Download the image at *get_url* into images/ and return its local path.

    Returns None for URLs that do not end in a recognised image extension.
    """
    if get_url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        img_name = 'images/{}'.format(get_url.split('/')[-1])
        # BUG FIX: this script does `import requests`, so the bare get() raised
        # NameError at runtime. Call requests.get, matching lovely_soup above.
        img = requests.get(get_url).content
        with open(img_name, 'wb') as f:
            f.write(img)
        return img_name
def get_kittens():
    """Download every kitten wallpaper on the first tag page; return the local file paths."""
    listing = lovely_soup('https://wallpaperscraft.com/tag/kitten')
    downloaded = []
    for link in listing.findAll('a', {'class': 'wallpapers__link'}):
        # Thumbnail -> detail page -> resolution page -> original image URL.
        detail = lovely_soup('https://wallpaperscraft.com{}'.format(link['href']))
        resolution_href = detail.findAll('span', {'class': 'wallpaper-table__cell'})[1].find('a')['href']
        download_page = lovely_soup('https://wallpaperscraft.com{}'.format(resolution_href))
        original = download_page.find('a', {'class': 'gui-button_full-height'})['href']
        downloaded.append(get_img(original))
    return downloaded
def get_headlines(kittens):
    """Map each scraped headline to a kitten image path and dump the result to data.json."""
    soup = lovely_soup('https://www.copypress.com/blog/40-headlines-the-good-the-bad-and-the-ugly/')
    container = soup.find('div', {'class': 'blog-single-inner-cont'})
    strongs = container.findAll('strong')
    data = {}
    # First <strong> is not a headline; zip stops at the shorter sequence.
    for tag, kitten in zip(strongs[1:], kittens):
        text = tag.text.lstrip('0123456789.- ').strip()
        data[text] = kitten
    with open('data.json', 'w+') as f:
        json.dump(data, f)
# Download the kittens, then pair them with headlines and write data.json.
kittens = get_kittens()
get_headlines(kittens)
This is the contents of the JSON file.
{"Eminem Terrified As Daughter Begins Dating Man Raised On His Music": "images/kitten_face_window_fluffy_92898_2046x1333.jpg", "Ways to Make Money While Waiting for Disability Benefits": "images/kitten_ball_thread_white_background_95135_3000x2399.jpg", "How to Have a Healthier and More Productive Home Office": "images/kitten_lying_striped_small_cute_102741_3872x2592.jpg", "A Little Mistake That Cost a Farmer $3,000 a Year": "images/kitten_sleeping_baby_striped_89331_1600x1200.jpg", "Are You Making These Embarrassing Mistakes at Work?": "images/kitten_cat_grass_123220_4272x2848.jpg", "Lose 8 Pounds in 2 Weeks": "images/kitten_fluffy_face_rose_grass_look_85837_2048x1432.jpg", "How Many of These Italian Foods Have You Tried?": "images/kitten_fluffy_look_95571_1920x1200.jpg", "What\u2019s Scarier Than the Sex Talk? Talking About Food & Weight": "images/kitten_briton_look_kid_96414_3000x2000.jpg", "More Than Half of Medical Advice on \u2018Dr. Oz\u2019 Lacks Proof or Contradicts Best Available Science": "images/kitten_cat_computer_keyboard_apple_mac_black_and_white_94218_1920x1280.jpg", "Lack Time? Here Are 4 Convenient Ways to Keep Your Dog Fit": "images/kitten_cat_playful_down_paw_95904_2560x1600.jpg", "How One Stupid Tweet Blew Up Justine Sacco\u2019s Life": "images/kitten_dark_lying_71730_1920x1200.jpg", "10 Signs That You Will NOT Make It As A Successful Photographer": "images/kitten_fluffy_grass_flowers_run_96294_2048x1365.jpg", "Sure-Fire Ways to Ruin Your Marriage": "images/kitten_look_surprise_striped_85821_1920x1538.jpg", "10 Different Types of Girlfriends \u2013 Which One Are You?": "images/kitten_protruding_tongue_photoshoot_pillow_97666_2048x1365.jpg", "More of Us May Be \u201cAlmost Alcoholics\u201d": "images/kitten_fluffy_look_kid_97036_3393x2248.jpg"}
This is how you parse it.
import json

# Load the scraped mapping back in and show every headline and its image path.
with open('data.json') as f:
    data = json.load(f)
for headline in data:
    print(headline)
    print(data.get(headline))
But what to do with it?
Tweet it
import json
import tweepy
from random import randint

# Twitter API credentials -- fill in your own.
consumer_key = 'XXXX'
consumer_secret = 'XXXX'
access_key = 'XXXX'
access_secret = 'XXXX'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

def tweet_kitten():
    """Tweet one randomly chosen headline with its kitten image attached."""
    with open('data.json') as f:
        data = json.load(f)
    # Walk the dict until we reach a randomly chosen 1-based position.
    pick = randint(1, len(data))
    position = 0
    for headline in data:
        position += 1
        if position == pick:
            api.update_with_media(data.get(headline), headline)
            return True

tweet_kitten()
Instaspam it
from InstagramAPI import InstagramAPI
from random import randint
import json

# Instagram credentials -- fill in your own.
instauser = 'XXXX'
instapass = 'XXXX'

api = InstagramAPI(instauser, instapass)
api.login()

with open('data.json') as f:
    data = json.load(f)

# Walk the dict until we reach a randomly chosen 1-based position,
# then upload that kitten with its headline as the caption.
pick = randint(1, len(data))
position = 0
for headline in data:
    position += 1
    if position == pick:
        img = data.get(headline)
        api.uploadPhoto(img, caption=headline)

api.logout()
Build a website
<!DOCTYPE html>
<!-- Static gallery page: fetches data.json and renders one Bulma card per headline/kitten pair. -->
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Playing with BeautifulSoup</title>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/bulma/0.7.2/css/bulma.min.css">
<style>
/* Vertical breathing room around the card grid. */
.kittens {
padding-top: 3em;
padding-bottom: 3em;
}
/* Crop card images so they fill their fixed 4:3 frame. */
.image img {
object-fit: cover;
}
/* Clamp long titles to a fixed number of lines with an ellipsis. */
.line-clamp {
display: block;
display: -webkit-box;
-webkit-box-orient: vertical;
position: relative;
line-height: 1.2;
overflow: hidden;
text-overflow: ellipsis;
padding: 0 !important;
}
/* Fallback '...' overlay with a fade, for browsers without -webkit-line-clamp. */
.line-clamp:after {
content: '...';
text-align: right;
bottom: 0;
right: 0;
width: 25%;
display: block;
position: absolute;
height: calc(1em * 1.2);
background: linear-gradient(to right, rgba(255, 255, 255, 0), rgba(255, 255, 255, 1) 75%);
}
/* Hide the fallback overlay when native line clamping is supported. */
@supports (-webkit-line-clamp: 1) {
.line-clamp:after {
display: none !important;
}
}
/* Two-line variant of the clamp. */
.line-clamp-2 {
-webkit-line-clamp: 2;
height: calc(1em * 1.2 * 2);
}
.footer {
padding-top: 6em;
padding-bottom: 6em;
}
/* Strikes through the word "love" in the footer joke. */
.footer span {
text-decoration: line-through;
}
</style>
</head>
<body>
<!-- Page banner. -->
<section class="hero is-black is-medium">
<div class="hero-body">
<div class="container">
<h1 class="title">Web Scraping With Python</h1>
<h2 class="subtitle">Playing with BeautifulSoup</h2>
</div>
</div>
</section>
<!-- Empty grid; cards are appended here by the script below. -->
<section class="section">
<div class="container">
<div class="columns is-multiline kittens"></div>
</div>
</section>
<footer class="footer">
<div class="content has-text-centered">
<p>Made with <span>love</span> a keyboard</p>
</div>
</footer>
<script src="https://code.jquery.com/jquery-3.3.1.min.js"></script>
<script>
// Load the scraped mapping (headline -> image path) and append a card per entry.
$.getJSON("data.json", function(data) {
$.each(data, function(title, image) {
$('.kittens').append(
`<div class='column is-4'>
<div class="card">
<div class="card-image">
<figure class="image is-4by3">
<img src="${image}" alt="${title}">
</figure>
</div>
<div class="card-content">
<div class="media">
<div class="media-content">
<p class="title is-6 line-clamp line-clamp-2">${title}</p>
</div>
</div>
</div>
</div>
</div>`
);
});
});
</script>
</body>
</html>
Stealing 743 kittens
DISCLAIMER - As always... This is an example and is more than likely against all the rules. Use at your own risk!
I was asked if I could cover downloading all the kitten images instead of just the first page.
from bs4 import BeautifulSoup
from requests import get
from fake_useragent import UserAgent
# One shared UserAgent instance so every request carries a realistic browser header.
ua = UserAgent()

def lovely_soup(u):
    """Fetch *u* with a random Chrome User-Agent and return the parsed lxml soup."""
    response = get(u, headers={'User-Agent': ua.chrome})
    return BeautifulSoup(response.content, 'lxml')
def get_img(get_url):
    """Save the image at *get_url* under images/ and return the local path.

    Returns None when the URL does not end in a recognised image extension.
    """
    if not get_url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        return None
    img_name = 'images/{}'.format(get_url.split('/')[-1])
    payload = get(get_url).content
    with open(img_name, 'wb') as f:
        f.write(payload)
    return img_name
def get_all_kittens():
    """Download every kitten wallpaper across all pages of the tag listing.

    Returns the list of local file paths produced by get_img (previously the
    list was built and then discarded; returning it is backward compatible).
    """
    soup = lovely_soup('https://wallpaperscraft.com/tag/kitten')
    # The pager's "last page" link href contains the total page count.
    last_href = soup.find('li', {'class': 'pager__item pager__item_last-page'}).find('a')['href']
    # str.isdigit replaces the lambda; total is already an int, so the
    # original's second int() conversion was redundant.
    last_page = int(''.join(filter(str.isdigit, last_href)))
    kittens = []
    for page in range(1, last_page + 1):
        page_soup = lovely_soup(f'https://wallpaperscraft.com/tag/kitten/page{page}')
        for link in page_soup.findAll('a', {'class': 'wallpapers__link'}):
            # Thumbnail -> detail page -> resolution page -> original image URL.
            detail = lovely_soup('https://wallpaperscraft.com{}'.format(link['href']))
            resolution_href = detail.findAll('span', {'class': 'wallpaper-table__cell'})[1].find('a')['href']
            download_page = lovely_soup('https://wallpaperscraft.com{}'.format(resolution_href))
            original = download_page.find('a', {'class': 'gui-button_full-height'})['href']
            kittens.append(get_img(original))
            print(len(kittens))  # crude progress indicator
    return kittens
# Only run the full scrape when executed as a script, not when imported.
if __name__ == '__main__':
    get_all_kittens()
Thanks for reading. x
Resources
- Python: https://python.org
- Json: https://docs.python-guide.org/scenarios/json
- Requests: http://docs.python-requests.org/
- BeautifulSoup: https://pypi.org/project/beautifulsoup4
- Fake Useragent: https://pypi.org/project/fake-useragent
- Jquery: https://jquery.com
- Bulma: https://bulma.io/
- Shout out to the r/learnpython mods for being awesome