代码
# -*- coding: utf-8 -*-
import sys
import os
import traceback
from playwright.sync_api import sync_playwright
def do_req(url, screenshot_path="/tmp/xxx.png"):
    """Open *url* in Chromium via Playwright and print what came back.

    The following are printed in order, each preceded by a separator line:
    the response headers, the response's security details, the page HTML
    from ``page.content()``, the inner text of ``<body>``, the page title,
    the final URL after navigation, and the byte length of a screenshot.

    Args:
        url: Address to navigate to.
        screenshot_path: File the screenshot is written to.

    Exceptions raised while navigating or inspecting the page are printed
    as a traceback instead of being propagated. A failure to launch the
    browser is not caught here. The browser is closed in all cases once it
    has been launched.
    """
    # Browser properties.
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    )
    args = [
        "--width=1920",
        "--height=1080",
        "--use-gl=egl",
    ]

    def _print_section(value):
        # Every piece of output is introduced by the same separator line.
        print("\n=======================")
        print(value)

    with sync_playwright() as playwright:
        # Synchronous handle to the Chromium browser type.
        chromium = playwright.chromium
        # Browser handle.
        browser = chromium.launch(timeout=40000, args=args)
        try:
            # Context handle.
            cxt = browser.new_context(
                user_agent=user_agent,
                ignore_https_errors=True,
                viewport={"width": 1920, "height": 1080},
            )
            # Page handle.
            page = cxt.new_page()
            # Navigate.
            resp = page.goto(url, timeout=40000)

            # page.goto() returns None for about:blank and for same-URL
            # hash navigations; there is no response to inspect then.
            headers = resp.headers if resp is not None else {}
            _print_section(headers)

            security_info = (
                resp.security_details() if resp is not None else None
            )
            _print_section(security_info)

            html_content = page.content()  # HTML of the page
            _print_section(html_content)

            text_content = page.inner_text('body')  # text of <body>
            _print_section(text_content)

            title = page.title()  # page title
            _print_section(title)

            urlfinal = page.url  # URL the page ended up on
            _print_section(urlfinal)

            screenshot = page.screenshot(path=screenshot_path)  # screenshot
            _print_section(len(screenshot))
        except Exception:
            # Best-effort diagnostic tool: report the failure and carry on.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            print(traceback.format_exc())
        finally:
            # Release the browser process on both success and failure.
            browser.close()
if __name__ == "__main__":
    # Script entry point: run the request against the sample page.
    target_url = "https://www.madbull.site/?page_id=2"
    do_req(target_url)
测试:

发表回复