Playwright访问并爬取页面

Playwright访问并爬取页面

代码

# -*- coding: utf-8 -*-
import sys
import os
import traceback
from playwright.sync_api import sync_playwright

def _print_section(value):
    """Print a separator line followed by ``value``."""
    print("\n=======================")
    print(value)


def do_req(url, screenshot_path="/tmp/xxx.png", timeout_ms=40000):
    """Load ``url`` in headless Chromium and dump what was fetched to stdout.

    Prints, each preceded by a separator line: the response headers, the
    TLS security details, the page HTML, the body's inner text, the page
    title, the final URL, and the size in bytes of a screenshot that is
    also written to ``screenshot_path``.

    Args:
        url: Address to navigate to.
        screenshot_path: File the screenshot is saved to.
        timeout_ms: Timeout in milliseconds, used for both the browser
            launch and the navigation.

    Errors raised after the browser has been launched are caught and
    their traceback is printed instead of propagating; a failure to
    launch the browser still propagates to the caller.
    """
    # Browser properties.
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    args = [
        "--width=1920",
        "--height=1080",
        "--use-gl=egl",
    ]

    with sync_playwright() as playwright:
        # Chromium browser type from the synchronous Playwright API.
        chromium = playwright.chromium
        # Launch the browser process.
        browser = chromium.launch(timeout=timeout_ms, args=args)

        try:
            # Browser context with a custom UA; certificate errors are ignored.
            cxt = browser.new_context(
                user_agent=user_agent,
                ignore_https_errors=True,
                viewport={"width": 1920, "height": 1080},
            )
            page = cxt.new_page()

            # Navigate to the target.
            resp = page.goto(url, timeout=timeout_ms)

            # page.goto() may return None (e.g. about:blank or a hash-only
            # navigation); skip the response-level output in that case
            # instead of failing on attribute access.
            if resp is not None:
                _print_section(resp.headers)
                _print_section(resp.security_details())

            _print_section(page.content())           # full HTML of the page
            _print_section(page.inner_text('body'))  # rendered text of <body>
            _print_section(page.title())             # page title
            _print_section(page.url)                 # final URL after navigation

            # Screenshot: saved to disk and also returned as bytes.
            screenshot = page.screenshot(path=screenshot_path)
            _print_section(len(screenshot))

        except Exception:
            # Best-effort script: report the failure rather than crash.
            print(traceback.format_exc())
        finally:
            # Always release the browser, even when a step above failed.
            browser.close()

# Script entry point: fetch the demo page when run directly.
if __name__ == "__main__":
    target_url = "https://www.madbull.site/?page_id=2"
    do_req(target_url)

测试:

评论

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注