Python Programming

Lecture 14 Web Scraping with Python

14.1 Scraping Examples

什么值得买 (smzdm)
Requests: Single Page

import requests
import pandas as pd

# Fetch one page of the smzdm home-page feed and export selected fields to Excel.

# Browser-like User-Agent. The pieces are joined with implicit string
# concatenation: a backslash continuation inside a string literal would keep
# the next line's leading indentation as part of the header value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 '
        'Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400'
    )
}
params = {"p": 1, "past_num": 20}
r = requests.get("https://www.smzdm.com/homepage/json_more",
                 params=params, headers=headers)
# Fail with the HTTP error itself rather than an opaque JSON decode error.
r.raise_for_status()
x = r.json()
data = x['data']

# One list per field, in feed order.
atype = [item['article_type'] for item in data]
title = [item['article_title'] for item in data]
price = [item['article_price'] for item in data]
date = [item['article_date'] for item in data]
category = [item['top_category'] for item in data]
mall = [item['article_mall'] for item in data]

# article_type is printed below but is not part of the exported table.
ex_data = pd.DataFrame({
    'article_title': title,
    'article_price': price,
    'article_date': date,
    'top_category': category,
    'article_mall': mall,
})
print(atype)
print(title)
# DataFrame.to_excel no longer accepts an `encoding` argument (removed in
# pandas 2.0), so it is not passed here.
ex_data.to_excel(excel_writer='smzdm.xlsx')
Requests: Multiple Pages

import requests
import pandas as pd

# Fetch pages 1-5 of the smzdm home-page feed and collect all items in `data`.

# Browser-like User-Agent, built with implicit string concatenation so that no
# source indentation ends up inside the header value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 '
        'Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400'
    )
}
data = []
for page in range(1, 6):
    # past_num grows by 20 per page (20, 40, ..., 100).
    params = {"p": page, "past_num": page * 20}
    r = requests.get("https://www.smzdm.com/homepage/json_more",
                     params=params, headers=headers)
    # Fail with the HTTP error itself rather than an opaque JSON decode error.
    r.raise_for_status()
    x = r.json()
    # Extend in place instead of rebuilding the whole list on every page.
    data.extend(x['data'])
丁香医生
Requests + BeautifulSoup

import requests
from bs4 import BeautifulSoup


# Query the dxy.com search page and inspect the response and the parsed HTML.

# Browser-like User-Agent, built with implicit string concatenation so that no
# source indentation ends up inside the header value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 '
        'Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400'
    )
}
params = {"query": "腿"}
r = requests.get("https://dxy.com/search/result",
                 params=params, headers=headers)
print(r.status_code)  # HTTP status code
print(r.headers)      # response headers
print(r.cookies)      # cookies set by the server
print(r.url)          # final URL, including the encoded query string

# Parse the raw response bytes with the lxml parser.
soup = BeautifulSoup(r.content, 'lxml')
print(soup.prettify())    # indented dump of the whole document
print(soup.title.string)  # text inside <title>
print(soup.title)         # the <title> tag itself
print(soup.head)          # the <head> tag and everything in it

Tag (标签), Attribute (属性), Node (节点)


# Attribute-style access (soup.<tag>) gives the first tag of that name.
first_div = soup.div
print(first_div)         # the <div> tag node
print(first_div.attrs)   # all attributes of that <div>
print(first_div['id'])   # its id attribute

first_h2 = soup.h2
print(first_h2)          # an <h2> heading tag node (headings are h1, h2, ...)
print(first_h2.string)   # the text wrapped by that <h2>

子节点和父节点


# Navigating from a tag to its children, parent and siblings.
first_link = soup.a
print(first_link.contents)   # list of the tag's child nodes
print(first_link.children)   # iterator over the children; wrap in list() to view
print(first_link.parent)     # the direct parent node

heading = soup.h2
print(heading.next_sibling)      # next node at the same level
print(heading.previous_sibling)  # previous node at the same level

# Follow the link carrying the "content-title-more" class (presumably the
# "view all results" link - confirm against the page) and collect article
# URLs and titles from the page it points to.
list_a = soup.find_all(attrs={"class": "content-title-more common-text-link"})
print(list_a)
all_url = list_a[0]["href"]
print(all_url)

# Send the same browser-like headers as the first request; the paginated
# version of this fetch does so as well.
r_all = requests.get(all_url, headers=headers)
soup_all = BeautifulSoup(r_all.content, 'lxml')
content_list = soup_all.find_all(attrs={"class": "article-title"})
print(content_list)

# Each matched element supplies the link target (href) and, through its first
# child node, the title text.
article_url = [link["href"] for link in content_list]
article_title = [link.contents[0].string for link in content_list]

Save to .docx files


import os
import re
import pypandoc
# Convert every collected article page to a .docx file in one output folder.
# The folder is addressed through os.path.join instead of os.chdir, so the
# working directory is left untouched for whatever code runs afterwards.
output_dir = 'dingxiang_search_leg'
os.makedirs(output_dir, exist_ok=True)  # no error if the folder already exists

# Characters kept in a file name (everything else is stripped):
#   \u4e00-\u9fa5   Chinese characters
#   \u0030-\u0039   digits
#   \u0041-\u005a   upper-case letters
#   \u0061-\u007a   lower-case letters
# Other Unicode ranges, for reference (not kept by the pattern below):
#   \uAC00-\uD7AF   Korean
#   \u3040-\u31FF   Japanese
unsafe_chars = re.compile(
    u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])")

for url, raw_title in zip(article_url, article_title):
    title = unsafe_chars.sub("", raw_title) + '.docx'
    # Source is the article URL, input format html, output format docx.
    output = pypandoc.convert_file(url, 'docx', 'html',
                                   outputfile=os.path.join(output_dir, title))
Multiple Pages

# Walk the paginated article list, collecting URLs and titles from each page.
num = 1   # page index sent to the server, starting at 1
pag = 1   # truthy placeholder so the loop body runs at least once
article_url = []
article_title = []
while pag:
    params_page = {"page_index": str(num)}
    r_all = requests.get(all_url, params=params_page, headers=headers)
    soup_all = BeautifulSoup(r_all.content, 'lxml')
    # An empty result here (no "pagination" element) ends the loop after the
    # current page has been collected.
    pag = soup_all.find_all(attrs={"class": "pagination"})
    content_list = soup_all.find_all(attrs={"class": "article-title"})
    if not content_list:
        # A page with no articles means there is nothing further to collect;
        # stop even if a pagination element is still rendered, otherwise the
        # loop would never terminate.
        break
    for link in content_list:
        article_url.append(link["href"])
        article_title.append(link.contents[0].string)
    num = num + 1

14.2 Scraping with Selenium

豆瓣读书
Selenium

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import os

# Drive a real browser to run the search and grab the resulting HTML
# (a simpler approach than reproducing the request by hand).
# The find_element_by_* helpers were removed in Selenium 4.3; the supported
# form is find_element(By.<strategy>, value).
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
try:
    browser.get("https://book.douban.com/")
    search_box = browser.find_element(By.ID, "inp-query")
    search_box.send_keys("Python")
    button = browser.find_element(By.CLASS_NAME, "inp-btn")
    button.click()
    page = browser.page_source
finally:
    # Always shut the browser and its driver process down, even on failure.
    browser.quit()

# Parse the HTML and collect each book's link and title.
soup = BeautifulSoup(page, 'lxml')
# Named link_list rather than `list` so the builtin is not shadowed.
link_list = soup.find_all(attrs={"class": "title-text"})
url_list = [link["href"] for link in link_list]
title_list = [link.string for link in link_list]

# Save the cover, introduction and table of contents of the first search
# result whose page has a cover image.

# Browser-like User-Agent, built with implicit string concatenation so that no
# source indentation ends up inside the header value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 '
        'Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400'
    )
}

# Create the output folder. Files are written through os.path.join rather
# than os.chdir, so the working directory is unchanged even if a step fails.
output_dir = 'douban_search'
os.makedirs(output_dir, exist_ok=True)

# Keep only Chinese characters, digits and ASCII letters in file names.
name_filter = re.compile(
    u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])")

# Find the first book page that has a cover image. Iterating over url_list
# (instead of an open-ended index counter) stops cleanly when no result
# qualifies.
fig = []
book = None
for i, book_url in enumerate(url_list):
    r = requests.get(book_url, headers=headers)
    book = BeautifulSoup(r.content, 'lxml')
    fig = book.find_all(attrs={"title": "点击看大图"})
    if fig:
        break

if fig:
    # Save the book cover. The path segment between "s/" and "/public" is
    # replaced with "l" - presumably switching from the small to the large
    # image; confirm against the site's image URLs.
    cover_src = fig[0]['src']
    position_1 = cover_src.find("s/")
    position_2 = cover_src.find("/public")
    src = cover_src[:position_1] + "l" + cover_src[position_2:]
    fig_r = requests.get(src, headers=headers)
    fig_title = name_filter.sub("", title_list[i]) + ".jpg"
    with open(os.path.join(output_dir, fig_title), "wb") as f:
        f.write(fig_r.content)

    # Gather the book introduction paragraphs.
    intro = book.find_all(attrs={"class": "intro"})
    filename = name_filter.sub("", title_list[i]) + ".txt"
    text_parts = [j.get_text() for j in intro]

    # Gather the table of contents: the element id is
    # "dir_<book id>_full", where the book id is the URL path segment
    # that follows "subject/".
    position_3 = url_list[i].find("subject/")
    position_4 = url_list[i].find("/", position_3 + 1)
    position_5 = url_list[i].find("/", position_4 + 1)
    bookid = "dir_" + url_list[i][position_4 + 1:position_5] + "_full"
    book_dir = book.find_all(attrs={"id": bookid})  # not `dir`: avoids shadowing the builtin
    if book_dir:
        text_parts.append(book_dir[0].get_text())

    # Append everything in one go; nothing is created when there is no text.
    if text_parts:
        with open(os.path.join(output_dir, filename), 'a',
                  encoding="utf-8") as file_object:
            file_object.write("".join(text_parts))
else:
    print("No search result with a cover image was found; nothing saved.")
       
import time
import random

# Save cover, introduction and table of contents for every search result that
# has a cover image. All per-book steps sit inside the loop, so each book is
# processed rather than only the last one fetched.

# Files go into douban_search via os.path.join; the working directory is
# never changed, so no os.chdir('..') is needed afterwards.
output_dir = 'douban_search'
os.makedirs(output_dir, exist_ok=True)

# Keep only Chinese characters, digits and ASCII letters in file names.
name_filter = re.compile(
    u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])")

for i, book_url in enumerate(url_list):
    r = requests.get(book_url, headers=headers)
    time.sleep(random.random() * 3)  # random pause of up to 3 s between books
    book = BeautifulSoup(r.content, 'lxml')
    fig = book.find_all(attrs={"title": "点击看大图"})
    if not fig:
        # No cover image on this page: skip the book entirely.
        continue

    # Save the book cover. The path segment between "s/" and "/public" is
    # replaced with "l" - presumably switching from the small to the large
    # image; confirm against the site's image URLs.
    cover_src = fig[0]['src']
    position_1 = cover_src.find("s/")
    position_2 = cover_src.find("/public")
    src = cover_src[:position_1] + "l" + cover_src[position_2:]
    fig_r = requests.get(src, headers=headers)
    fig_title = name_filter.sub("", title_list[i]) + ".jpg"
    with open(os.path.join(output_dir, fig_title), "wb") as f:
        f.write(fig_r.content)

    # Gather the book introduction paragraphs.
    intro = book.find_all(attrs={"class": "intro"})
    filename = name_filter.sub("", title_list[i]) + ".txt"
    text_parts = [j.get_text() for j in intro]

    # Gather the table of contents: the element id is
    # "dir_<book id>_full", where the book id is the URL path segment
    # that follows "subject/".
    position_3 = book_url.find("subject/")
    position_4 = book_url.find("/", position_3 + 1)
    position_5 = book_url.find("/", position_4 + 1)
    bookid = "dir_" + book_url[position_4 + 1:position_5] + "_full"
    book_dir = book.find_all(attrs={"id": bookid})  # not `dir`: avoids shadowing the builtin
    if book_dir:
        text_parts.append(book_dir[0].get_text())

    # Append everything in one go; nothing is created when there is no text.
    if text_parts:
        with open(os.path.join(output_dir, filename), 'a',
                  encoding="utf-8") as file_object:
            file_object.write("".join(text_parts))
一些反爬策略
  • 通过User-Agent来控制访问
  • IP限制 (IP池,延时模拟人工)
  • SESSION访问限制
  • 蜘蛛陷阱
  • 验证码验证
  • 通过robots.txt来限制爬虫
  • 数据动态加载
  • 数据加密-使用加密算法

Summary

  • Python 3 网络爬虫开发实战