Learning Man's Blog

pyppeteer 学习记录

字数统计: 390 | 阅读时长: 2 min
2019/02/06 Share

保留几个问题

  1. 在 Python 中,`$` 不能出现在函数名里,所以 pyppeteer 的作者把 puppeteer 的 `$` 方法改名为 `querySelector`(`$` -> `querySelector`)。

selector

对于固定元素,可以通过chrome自带功能快速获取

小脚本

一个简易脚本,用于在信息收集阶段对一批网页进行批量截图;国外有类似的工具 aquatone。

# coding: utf-8
import asyncio
from pyppeteer import launch
from urllib.parse import urlparse
from optparse import OptionParser
from os import mkdir
from datetime import datetime


# Command-line interface of the screenshot script. The parsed values are read
# later through the `dest` names: filename, ssl, fullpage, timewait, timeout.
opts = OptionParser()

# Input file listing the target addresses.
opts.add_option(
    '-f',
    dest="filename",
    action='store',
    type="string",
    default="",
    help="目标地址文件路径",
)

# Boolean switches; both are off unless given on the command line.
opts.add_option(
    '--https',
    dest="ssl",
    action='store_true',
    default=False,
    help="对无协议地址,是否使用https访问。默认:False",
)
opts.add_option(
    '--full-page',
    dest="fullpage",
    action='store_true',
    default=False,
    help="全屏截图。默认:False",
)

# Timing knobs, both integers expressed in milliseconds.
opts.add_option(
    '--time-wait',
    dest="timewait",
    action='store',
    type='int',
    default=1000,
    help="等待页面加载时间,视网速而定。默认:1000ms",
)
opts.add_option(
    '--time-out',
    dest="timeout",
    action='store',
    type='int',
    default=3000,
    help="页面超时时间。默认:3000ms",
)


def getUrlFromFile(filename):
    """Yield the target addresses listed in a text file, one per line.

    Every line is stripped of surrounding whitespace. Blank (or
    whitespace-only) lines are skipped so the consumer never receives an
    empty address.

    Args:
        filename: Path of the text file to read.

    Yields:
        Each non-empty, stripped line of the file, in file order.
    """
    with open(filename) as file:
        for line in file:
            target = line.strip()
            # A trailing newline at the end of the file or a spacer line
            # would otherwise be handed on as an empty target.
            if target:
                yield target


def _screenshotName(url):
    """Turn a target address into a string that is safe as a file name.

    Letters, digits, '.', '-' and '_' are kept; every other character
    (notably '/', ':' and '?') becomes '_', so a URL containing a path can
    no longer point into a directory that does not exist.
    """
    return "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in url)


async def _closeBrowser(browser):
    """Best-effort shutdown of a browser; errors while closing are ignored."""
    try:
        await browser.close()
    except Exception:
        # The browser may already be dead, which is often why we are closing
        # it in the first place; there is nothing more to do about it.
        pass


async def getScreenshot(urlList, options):
    """Visit every address in ``urlList`` and save a PNG screenshot of it.

    Addresses without an explicit ``scheme://`` prefix are prefixed with
    ``https://`` when ``options.ssl`` is true and ``http://`` otherwise.
    Screenshots are written to ``<options.path>/<sanitized address>.png``.

    A failure on one address is reported with a warning and does not stop
    the run: the browser is discarded and a new one is launched for the
    remaining addresses. The browser is always closed before returning.

    Args:
        urlList: Iterable of target addresses (host, host:port or full URL).
        options: Parsed command-line options. Reads ``ssl``, ``timeout``
            (passed to ``page.goto``), ``timewait`` (passed to
            ``page.waitFor``), ``path`` (output directory) and ``fullpage``
            (passed as the ``fullPage`` screenshot option).
    """
    scheme = "https://" if options.ssl else "http://"
    browser = await launch(headless=False)
    try:
        page = await browser.newPage()
        for url in urlList:
            if not url:
                # Nothing to visit; avoids a pointless failure + relaunch.
                continue
            # Look for "://" rather than relying on urlparse(): since
            # Python 3.9 urlparse("host:8080") reports "host" as the scheme,
            # which left host:port targets without any protocol.
            fullUrl = url if "://" in url else scheme + url
            target = '%s/%s.png' % (options.path, _screenshotName(url))
            try:
                await page.goto(fullUrl, timeout=options.timeout)
                await page.waitFor(options.timewait)
                await page.screenshot({'path': target, 'fullPage': options.fullpage})
            except Exception:
                print("[!]Warning: failure of screenshot %s" % url)
                # Start over with a fresh browser so that a hung or crashed
                # page cannot affect the remaining targets.
                await _closeBrowser(browser)
                browser = await launch(headless=False)
                page = await browser.newPage()
    finally:
        # Without this the Chromium process outlives the run.
        await _closeBrowser(browser)


if __name__ == '__main__':
    # Script entry point: parse the command line, create a timestamped
    # output directory, then screenshot every address listed in the file.
    try:
        options, args = opts.parse_args()
        if not options.filename:
            # print_help() writes the usage text itself and returns None;
            # wrapping it in print() emitted a stray "None" line.
            opts.print_help()
            exit(1)
        # One output directory per run, named after the start time.
        # NOTE(review): the ':' characters in this name are not valid in
        # Windows paths; confirm whether Windows needs to be supported.
        options.path = datetime.now().strftime('%Y.%m.%d %H:%M:%S')
        mkdir(options.path)
        urls = getUrlFromFile(options.filename)
        asyncio.get_event_loop().run_until_complete(getScreenshot(urls, options))
    except KeyboardInterrupt:
        # Ctrl-C ends the run with a distinct exit status.
        print("[!] Exiting ~")
        exit(2)

效果

tl;dr

多动手编写就好了

CATALOG
  1. 保留几个问题
  2. selector
  3. 小脚本
  4. tl;dr