基于selenium的简单爬虫——爬取蚂蚁学习网

佚名 5年前 (2020-07-12) Python 953人围观抢沙发百度已收录

记录一下第二个爬取的试题网站，和上次爬取的问答库不同，这个网站连登录都不需要，捡到宝了。虽然大部分的问卷库都没有反爬机制，但是像这样不收费的网站属实罕见。
这次爬取的是蚂蚁学习网中的高中语文题库。
先把源码给上：

import json
from bs4 import BeautifulSoup
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait  # 引用设定显示等待时间


def init():
    """Start the Chrome session and publish the shared state as globals.

    Assigns the module-level names ``url``, ``browser``, ``username``,
    ``password`` and ``wait`` so that code outside this function can
    reach them.
    """
    global url, browser, username, password, wait

    # First page of the exam listing.
    url = 'https://k12.mayi173.com/exam/list/3-1-0-1.html'

    # Launch Chrome through the chromedriver binary at this fixed path.
    # NOTE(review): passing the driver path positionally is the older
    # Selenium calling convention -- confirm against the installed version.
    driver_path = "C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe"
    browser = webdriver.Chrome(driver_path)

    # Account name and password; both are left empty here.
    username = password = ""

    # Explicit-wait helper bound to the browser, with a 20 second timeout.
    wait = WebDriverWait(browser, 20)


def get_item_info(url1):
    """Scrape one question page and append it to ``mayi.json``.

    Loads ``url1`` in the shared ``browser``, extracts four text fields
    with CSS selectors, prints the resulting dict and appends it to
    ``mayi.json`` as an indented JSON object followed by a newline.

    Args:
        url1: Absolute URL of the question detail page.

    Raises:
        IndexError: If any of the four selectors matches no element.
    """
    browser.get(url1)
    soup = BeautifulSoup(browser.page_source.encode('utf-8'), 'lxml')

    # Common prefix of the selectors that live inside the question panel.
    panel = ('body > div.i-page > div.i-content.g-clear > div.i-left > '
             'div.i-panel.p-panel > ')
    # Output key -> CSS selector; the first match of each supplies the text.
    # 'jiexi' is presumably the explanation section -- confirm against the page.
    selectors = {
        'question_type': panel + 'div.header > label:nth-child(2)',
        'question': panel + 'div.content ',
        'answer': '#i-tab-content > div',
        'jiexi': 'body > div.i-page > div.i-content.g-clear > div.i-left > '
                 'div:nth-child(3) > div:nth-child(2) > div',
    }
    data = {field: soup.select(css)[0].text for field, css in selectors.items()}

    print(data)
    with open('mayi.json', 'a', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
        # Repeated dumps into one file are not framed; without a separator
        # consecutive records run together as "}{" and cannot be split.
        f.write('\n')


def get_all_link():
    """Visit listing pages 1 through 49 and scrape every linked question.

    For each listing page, loads it in the shared ``browser``, collects
    the anchors in each result's footer and passes the resolved URL of
    each one to ``get_item_info``.
    """
    # Imported locally so this function does not depend on the file header.
    from urllib.parse import urljoin

    site_root = 'https://k12.mayi173.com/'
    # range(1, 50) already stops after page 49, so no explicit break is needed.
    for page in range(1, 50):
        # Page 1 follows the same URL pattern as every other page.
        browser.get(site_root + 'exam/list/3-1-0-' + str(page) + '.html')
        # Fixed pause before reading the page source -- presumably to let
        # the page finish rendering.
        sleep(5)
        soup = BeautifulSoup(browser.page_source.encode('utf-8'), 'lxml')
        hrefs_list = soup.select(
            'body > div.i-page > div.i-content.g-clear > div.i-right.p-right > div.p-results > div > div.footer > a')
        for href in hrefs_list:
            link = href.get('href')
            if not link:
                # An anchor without an href has nothing to follow.
                continue
            # urljoin handles both "exam/..." and "/exam/..." forms without
            # producing a doubled slash after the host.
            get_item_info(urljoin(site_root, link))


def main():
    """Run the crawler: start the browser, crawl, then close the browser.

    The browser created by ``init`` is always shut down, even when the
    crawl stops with an exception, so no Chrome/chromedriver process is
    left behind.
    """
    # Set up the shared browser session and related globals.
    init()
    try:
        # Fixed 15 second pause before crawling starts.
        # NOTE(review): purpose not visible here -- possibly a leftover
        # from a version that needed a manual login; confirm.
        sleep(15)
        get_all_link()
    finally:
        browser.quit()


# Start the crawler only when this file is executed directly as a script.
if __name__ == '__main__':
    main()

下面简单说一下爬取思路，其实与上一篇爬取问答库如出一辙，还更为简单

导入第三方库

import json
from bs4 import BeautifulSoup
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait  # 引用设定显示等待时间

初始化 chromedriver

def init():
    """Start the Chrome session and publish the shared state as globals.

    Assigns the module-level names ``url``, ``browser``, ``username``,
    ``password`` and ``wait`` so that code outside this function can
    reach them.
    """
    global url, browser, username, password, wait

    # First page of the exam listing.
    url = 'https://k12.mayi173.com/exam/list/3-1-0-1.html'

    # Launch Chrome through the chromedriver binary at this fixed path.
    # NOTE(review): passing the driver path positionally is the older
    # Selenium calling convention -- confirm against the installed version.
    driver_path = "C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe"
    browser = webdriver.Chrome(driver_path)

    # Account name and password; both are left empty here.
    username = password = ""

    # Explicit-wait helper bound to the browser, with a 20 second timeout.
    wait = WebDriverWait(browser, 20)