zhenxun_bot/zhenxun/utils/http_utils.py

import time
import asyncio
from pathlib import Path
from typing import Any, Literal, ClassVar
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
from asyncio.exceptions import TimeoutError
import rich
import httpx
import aiofiles
from retrying import retry
from playwright.async_api import Page
from nonebot_plugin_alconna import UniMessage
from nonebot_plugin_htmlrender import get_browser
from httpx import Response, ConnectTimeout, HTTPStatusError
from zhenxun.services.log import logger
from zhenxun.configs.config import BotConfig
from zhenxun.utils.message import MessageUtils
from zhenxun.utils.user_agent import get_user_agent
# from .browser import get_browser
class AsyncHttpx:
    """Async HTTP helpers built on httpx: GET/HEAD/POST requests with proxy
    support, multi-URL fallback, and file downloads."""

proxy: ClassVar[dict[str, str | None]] = {
"http://": BotConfig.system_proxy,
"https://": BotConfig.system_proxy,
}
@classmethod
@retry(stop_max_attempt_number=3)
async def get(
cls,
url: str | list[str],
        *,
        params: dict[str, Any] | None = None,
        headers: dict[str, str] | None = None,
        cookies: dict[str, str] | None = None,
        verify: bool = True,
        use_proxy: bool = True,
        proxy: dict[str, str] | None = None,
timeout: int = 30,
**kwargs,
) -> Response:
"""Get
参数:
url: url
params: params
headers: 请求头
cookies: cookies
verify: verify
use_proxy: 使用默认代理
proxy: 指定代理
timeout: 超时时间
"""
urls = [url] if isinstance(url, str) else url
return await cls._get_first_successful(
urls,
params=params,
headers=headers,
cookies=cookies,
verify=verify,
use_proxy=use_proxy,
proxy=proxy,
timeout=timeout,
**kwargs,
)
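    # Usage sketch (illustrative only; the URLs below are placeholders, not part of
    # this project). Each URL is tried in order by _get_first_successful and the
    # first response that does not raise is returned:
    #
    #     async def fetch_json() -> dict:
    #         resp = await AsyncHttpx.get(
    #             ["https://primary.example.com/api", "https://mirror.example.com/api"],
    #             timeout=10,
    #         )
    #         resp.raise_for_status()
    #         return resp.json()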
@classmethod
async def _get_first_successful(
cls,
urls: list[str],
**kwargs,
) -> Response:
last_exception = None
for url in urls:
try:
return await cls._get_single(url, **kwargs)
except Exception as e:
last_exception = e
if url != urls[-1]:
logger.warning(f"获取 {url} 失败, 尝试下一个")
raise last_exception or Exception("All URLs failed")
@classmethod
async def _get_single(
cls,
url: str,
*,
params: dict[str, Any] | None = None,
headers: dict[str, str] | None = None,
cookies: dict[str, str] | None = None,
verify: bool = True,
use_proxy: bool = True,
proxy: dict[str, str] | None = None,
timeout: int = 30,
**kwargs,
) -> Response:
if not headers:
headers = get_user_agent()
        _proxy = proxy or (cls.proxy if use_proxy else None)
async with httpx.AsyncClient(proxies=_proxy, verify=verify) as client: # type: ignore
return await client.get(
url,
params=params,
headers=headers,
cookies=cookies,
timeout=timeout,
**kwargs,
)
@classmethod
async def head(
cls,
url: str,
*,
params: dict[str, Any] | None = None,
headers: dict[str, str] | None = None,
cookies: dict[str, str] | None = None,
verify: bool = True,
use_proxy: bool = True,
proxy: dict[str, str] | None = None,
timeout: int = 30,
**kwargs,
) -> Response:
"""Get
参数:
url: url
params: params
headers: 请求头
cookies: cookies
verify: verify
use_proxy: 使用默认代理
proxy: 指定代理
timeout: 超时时间
"""
if not headers:
headers = get_user_agent()
        _proxy = proxy or (cls.proxy if use_proxy else None)
async with httpx.AsyncClient(proxies=_proxy, verify=verify) as client: # type: ignore
return await client.head(
url,
params=params,
headers=headers,
cookies=cookies,
timeout=timeout,
**kwargs,
)
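    # Usage sketch (placeholder URL): a HEAD probe reads response headers without
    # downloading the body, e.g. to check a file's size before fetching it:
    #
    #     resp = await AsyncHttpx.head("https://example.com/file.bin", timeout=6)
    #     size = int(resp.headers.get("content-length", 0))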
@classmethod
async def post(
cls,
url: str,
*,
        data: dict[str, Any] | None = None,
        content: Any = None,
        files: Any = None,
        verify: bool = True,
        use_proxy: bool = True,
        proxy: dict[str, str] | None = None,
        json: dict[str, Any] | None = None,
        params: dict[str, str] | None = None,
        headers: dict[str, str] | None = None,
        cookies: dict[str, str] | None = None,
timeout: int = 30,
**kwargs,
) -> Response:
"""
说明:
Post
参数:
url: url
data: data
content: content
files: files
use_proxy: 是否默认代理
proxy: 指定代理
json: json
params: params
headers: 请求头
cookies: cookies
timeout: 超时时间
"""
if not headers:
headers = get_user_agent()
        _proxy = proxy or (cls.proxy if use_proxy else None)
async with httpx.AsyncClient(proxies=_proxy, verify=verify) as client: # type: ignore
return await client.post(
url,
content=content,
data=data,
files=files,
json=json,
params=params,
headers=headers,
cookies=cookies,
timeout=timeout,
**kwargs,
)
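    # Usage sketch (placeholder endpoint and payload): a JSON POST that goes through
    # the same proxy/header handling as above:
    #
    #     resp = await AsyncHttpx.post(
    #         "https://api.example.com/v1/items",
    #         json={"name": "demo"},
    #         timeout=15,
    #     )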
@classmethod
    async def get_content(cls, url: str, **kwargs) -> bytes | None:
        """GET ``url`` and return the response body, or None if the status is not 200."""
        res = await cls.get(url, **kwargs)
        return res.content if res and res.status_code == 200 else None
@classmethod
async def download_file(
cls,
url: str | list[str],
        path: str | Path,
        *,
        params: dict[str, str] | None = None,
        verify: bool = True,
        use_proxy: bool = True,
        proxy: dict[str, str] | None = None,
        headers: dict[str, str] | None = None,
        cookies: dict[str, str] | None = None,
        timeout: int = 30,
        stream: bool = False,
        follow_redirects: bool = True,
**kwargs,
) -> bool:
"""下载文件
参数:
url: url
path: 存储路径
params: params
verify: verify
use_proxy: 使用代理
proxy: 指定代理
headers: 请求头
cookies: cookies
timeout: 超时时间
stream: 是否使用流式下载流式写入+进度条适用于下载大文件
"""
if isinstance(path, str):
path = Path(path)
path.parent.mkdir(parents=True, exist_ok=True)
try:
for _ in range(3):
if not isinstance(url, list):
url = [url]
for u in url:
try:
if not stream:
response = await cls.get(
u,
params=params,
headers=headers,
cookies=cookies,
use_proxy=use_proxy,
proxy=proxy,
timeout=timeout,
follow_redirects=follow_redirects,
**kwargs,
)
response.raise_for_status()
content = response.content
async with aiofiles.open(path, "wb") as wf:
await wf.write(content)
logger.info(f"下载 {u} 成功.. Path{path.absolute()}")
else:
if not headers:
headers = get_user_agent()
_proxy = proxy or (cls.proxy if use_proxy else None)
async with httpx.AsyncClient(
proxies=_proxy, # type: ignore
verify=verify,
) as client:
async with client.stream(
"GET",
u,
params=params,
headers=headers,
cookies=cookies,
timeout=timeout,
                                    follow_redirects=follow_redirects,
**kwargs,
) as response:
response.raise_for_status()
logger.info(
f"开始下载 {path.name}.. "
f"Path: {path.absolute()}"
)
async with aiofiles.open(path, "wb") as wf:
total = int(
response.headers.get("Content-Length", 0)
)
with rich.progress.Progress( # type: ignore
rich.progress.TextColumn(path.name), # type: ignore
"[progress.percentage]{task.percentage:>3.0f}%", # type: ignore
rich.progress.BarColumn(bar_width=None), # type: ignore
rich.progress.DownloadColumn(), # type: ignore
rich.progress.TransferSpeedColumn(), # type: ignore
) as progress:
download_task = progress.add_task(
"Download",
total=total or None,
)
async for chunk in response.aiter_bytes():
await wf.write(chunk)
await wf.flush()
progress.update(
download_task,
completed=response.num_bytes_downloaded,
)
logger.info(
f"下载 {u} 成功.. "
f"Path{path.absolute()}"
)
return True
except (TimeoutError, ConnectTimeout, HTTPStatusError):
logger.warning(f"下载 {u} 失败.. 尝试下一个地址..")
logger.error(f"下载 {url} 下载超时.. Path{path.absolute()}")
except Exception as e:
logger.error(f"下载 {url} 错误 Path{path.absolute()}", e=e)
return False
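    # Usage sketch (placeholder URL and path): stream a large file to disk with the
    # chunked write + progress bar path above; False is returned only after all URLs
    # have failed in each of the 3 retry rounds:
    #
    #     ok = await AsyncHttpx.download_file(
    #         "https://example.com/models/big_model.bin",
    #         Path("data/big_model.bin"),
    #         stream=True,
    #     )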
@classmethod
async def gather_download_file(
cls,
url_list: list[str] | list[list[str]],
        path_list: list[str | Path],
        *,
        limit_async_number: int | None = None,
        params: dict[str, str] | None = None,
        use_proxy: bool = True,
        proxy: dict[str, str] | None = None,
        headers: dict[str, str] | None = None,
        cookies: dict[str, str] | None = None,
timeout: int = 30,
**kwargs,
) -> list[bool]:
"""分组同时下载文件
参数:
url_list: url列表
path_list: 存储路径列表
limit_async_number: 限制同时请求数量
params: params
use_proxy: 使用代理
proxy: 指定代理
headers: 请求头
cookies: cookies
timeout: 超时时间
"""
        if (n := len(url_list)) != len(path_list):
            raise UrlPathNumberNotEqual(
                f"Url数量与Path数量不对等, Url: {len(url_list)}, Path: {len(path_list)}"
            )
if limit_async_number and n > limit_async_number:
m = float(n) / limit_async_number
x = 0
j = limit_async_number
_split_url_list = []
_split_path_list = []
for _ in range(int(m)):
_split_url_list.append(url_list[x:j])
_split_path_list.append(path_list[x:j])
x += limit_async_number
j += limit_async_number
            if int(m) < m:
                # the leftover tail (fewer than limit_async_number items) starts at x
                _split_url_list.append(url_list[x:])
                _split_path_list.append(path_list[x:])
else:
_split_url_list = [url_list]
_split_path_list = [path_list]
tasks = []
result_ = []
for x, y in zip(_split_url_list, _split_path_list):
for url, path in zip(x, y):
tasks.append(
asyncio.create_task(
cls.download_file(
url,
path,
params=params,
headers=headers,
cookies=cookies,
use_proxy=use_proxy,
timeout=timeout,
proxy=proxy,
**kwargs,
)
)
)
_x = await asyncio.gather(*tasks)
result_ = result_ + list(_x)
tasks.clear()
return result_
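    # Usage sketch (placeholder URLs and paths): download a batch of files with at
    # most 10 concurrent requests per group:
    #
    #     urls = [f"https://example.com/img/{i}.jpg" for i in range(30)]
    #     paths = [Path(f"images/{i}.jpg") for i in range(30)]
    #     results = await AsyncHttpx.gather_download_file(
    #         urls, paths, limit_async_number=10
    #     )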
@classmethod
    async def get_fastest_mirror(cls, url_list: list[str]) -> list[str]:
        """Probe each mirror with a HEAD request and return the URLs sorted by
        response time; mirrors that fail the probe are dropped."""
        assert url_list

async def head_mirror(client: type[AsyncHttpx], url: str) -> dict[str, Any]:
begin_time = time.time()
response = await client.head(url=url, timeout=6)
elapsed_time = (time.time() - begin_time) * 1000
content_length = int(response.headers.get("content-length", 0))
return {
"url": url,
"elapsed_time": elapsed_time,
"content_length": content_length,
}
logger.debug(f"开始获取最快镜像,可能需要一段时间... | URL列表{url_list}")
results = await asyncio.gather(
*(head_mirror(cls, url) for url in url_list),
return_exceptions=True,
)
_results: list[dict[str, Any]] = []
for result in results:
if isinstance(result, BaseException):
logger.warning(f"获取镜像失败,错误:{result}")
else:
logger.debug(f"获取镜像成功,结果:{result}")
_results.append(result)
_results = sorted(iter(_results), key=lambda r: r["elapsed_time"])
return [result["url"] for result in _results]
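    # Usage sketch (placeholder mirror URLs): probe the mirrors and download from
    # the fastest one that responded:
    #
    #     mirrors = await AsyncHttpx.get_fastest_mirror(
    #         ["https://mirror-a.example.com/pkg.zip", "https://mirror-b.example.com/pkg.zip"]
    #     )
    #     if mirrors:
    #         await AsyncHttpx.download_file(mirrors[0], Path("pkg.zip"))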
class AsyncPlaywright:
    """Playwright helpers for opening pages and taking element screenshots."""

@classmethod
@asynccontextmanager
async def new_page(
cls, cookies: list[dict[str, Any]] | dict[str, Any] | None = None, **kwargs
) -> AsyncGenerator[Page, None]:
"""获取一个新页面
参数:
2024-11-07 13:38:20 +08:00
cookies: cookies
2024-02-25 03:18:34 +08:00
"""
browser = await get_browser()
ctx = await browser.new_context(**kwargs)
if cookies:
if isinstance(cookies, dict):
cookies = [cookies]
await ctx.add_cookies(cookies) # type: ignore
page = await ctx.new_page()
try:
yield page
finally:
await page.close()
await ctx.close()
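    # Usage sketch (placeholder URL and selector): keyword arguments such as the
    # viewport are passed through to browser.new_context:
    #
    #     async with AsyncPlaywright.new_page(
    #         viewport={"width": 1280, "height": 720}
    #     ) as page:
    #         await page.goto("https://example.com")
    #         await page.wait_for_selector("#content")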
@classmethod
async def screenshot(
cls,
url: str,
path: Path | str,
element: str | list[str],
*,
wait_time: int | None = None,
        viewport_size: dict[str, int] | None = None,
        wait_until: (
            Literal["domcontentloaded", "load", "networkidle"] | None
        ) = "networkidle",
        timeout: float | None = None,
        type_: Literal["jpeg", "png"] | None = None,
        user_agent: str | None = None,
        cookies: list[dict[str, Any]] | dict[str, Any] | None = None,
        **kwargs,
    ) -> UniMessage | None:
"""截图,该方法仅用于简单快捷截图,复杂截图请操作 page
参数:
url: 网址
path: 存储路径
element: 元素选择
wait_time: 等待截取超时时间
viewport_size: 窗口大小
wait_until: 等待类型
timeout: 超时限制
type_: 保存类型
2024-11-07 13:38:20 +08:00
user_agent: user_agent
cookies: cookies
2024-02-25 03:18:34 +08:00
"""
        if viewport_size is None:
            viewport_size = {"width": 2560, "height": 1080}
if isinstance(path, str):
path = Path(path)
wait_time = wait_time * 1000 if wait_time else None
        element_list = [element] if isinstance(element, str) else element
        async with cls.new_page(
            cookies,
viewport=viewport_size,
user_agent=user_agent,
**kwargs,
) as page:
await page.goto(url, timeout=timeout, wait_until=wait_until)
card = page
for e in element_list:
if not card:
return None
card = await card.wait_for_selector(e, timeout=wait_time)
if card:
await card.screenshot(path=path, timeout=timeout, type=type_)
                return MessageUtils.build_message(path)
return None
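    # Usage sketch (placeholder URL, selector and path): capture one element and
    # build a sendable message; msg.send() must run inside an event handler:
    #
    #     msg = await AsyncPlaywright.screenshot(
    #         "https://example.com/stats",
    #         Path("screenshots/stats.png"),
    #         "#chart",
    #         wait_time=10,
    #     )
    #     if msg:
    #         await msg.send()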
class UrlPathNumberNotEqual(Exception):
    """Raised when the number of URLs does not match the number of target paths."""


class BrowserIsNone(Exception):
    """Raised when no browser instance is available."""