2021-11-23 21:44:59 +08:00
|
|
|
|
import asyncio
|
2023-02-18 18:46:54 +08:00
|
|
|
|
from asyncio.exceptions import TimeoutError
|
|
|
|
|
|
from contextlib import asynccontextmanager
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
from typing import Any, AsyncGenerator, Dict, List, Literal, Optional, Union
|
|
|
|
|
|
|
2021-11-23 21:44:59 +08:00
|
|
|
|
import aiofiles
|
|
|
|
|
|
import httpx
|
2022-05-25 00:56:41 +08:00
|
|
|
|
import rich
|
2023-02-18 18:46:54 +08:00
|
|
|
|
from httpx import ConnectTimeout, Response
|
|
|
|
|
|
from nonebot.adapters.onebot.v11 import MessageSegment
|
|
|
|
|
|
from playwright.async_api import BrowserContext, Page
|
|
|
|
|
|
from retrying import retry
|
|
|
|
|
|
|
|
|
|
|
|
from services.log import logger
|
|
|
|
|
|
from utils.user_agent import get_user_agent
|
|
|
|
|
|
|
|
|
|
|
|
from .browser import get_browser
|
|
|
|
|
|
from .message_builder import image
|
|
|
|
|
|
from .utils import get_local_proxy
|
2021-11-23 21:44:59 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class AsyncHttpx:
|
|
|
|
|
|
|
2021-12-01 15:20:32 +08:00
|
|
|
|
proxy = {"http://": get_local_proxy(), "https://": get_local_proxy()}
|
2021-11-23 21:44:59 +08:00
|
|
|
|
|
|
|
|
|
|
@classmethod
|
2021-12-01 14:03:34 +08:00
|
|
|
|
@retry(stop_max_attempt_number=3)
|
2021-11-23 21:44:59 +08:00
|
|
|
|
async def get(
|
|
|
|
|
|
cls,
|
|
|
|
|
|
url: str,
|
|
|
|
|
|
*,
|
|
|
|
|
|
params: Optional[Dict[str, Any]] = None,
|
|
|
|
|
|
headers: Optional[Dict[str, str]] = None,
|
|
|
|
|
|
cookies: Optional[Dict[str, str]] = None,
|
2022-06-14 18:47:43 +08:00
|
|
|
|
verify: bool = True,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
use_proxy: bool = True,
|
2023-02-18 18:46:54 +08:00
|
|
|
|
proxy: Optional[Dict[str, str]] = None,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
timeout: Optional[int] = 30,
|
2021-12-16 11:16:28 +08:00
|
|
|
|
**kwargs,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
) -> Response:
|
|
|
|
|
|
"""
|
2022-07-23 14:09:17 +08:00
|
|
|
|
说明:
|
2021-12-16 11:16:28 +08:00
|
|
|
|
Get
|
2022-07-23 14:09:17 +08:00
|
|
|
|
参数:
|
2021-12-16 11:16:28 +08:00
|
|
|
|
:param url: url
|
|
|
|
|
|
:param params: params
|
|
|
|
|
|
:param headers: 请求头
|
|
|
|
|
|
:param cookies: cookies
|
2022-10-22 18:12:55 +08:00
|
|
|
|
:param verify: verify
|
2021-12-16 11:16:28 +08:00
|
|
|
|
:param use_proxy: 使用默认代理
|
|
|
|
|
|
:param proxy: 指定代理
|
|
|
|
|
|
:param timeout: 超时时间
|
2021-11-23 21:44:59 +08:00
|
|
|
|
"""
|
|
|
|
|
|
if not headers:
|
|
|
|
|
|
headers = get_user_agent()
|
2023-03-18 01:39:31 +08:00
|
|
|
|
proxy_ = proxy if proxy else cls.proxy if use_proxy else None
|
|
|
|
|
|
async with httpx.AsyncClient(proxies=proxy_, verify=verify) as client:
|
2021-11-23 21:44:59 +08:00
|
|
|
|
return await client.get(
|
|
|
|
|
|
url,
|
|
|
|
|
|
params=params,
|
|
|
|
|
|
headers=headers,
|
|
|
|
|
|
cookies=cookies,
|
|
|
|
|
|
timeout=timeout,
|
2023-02-18 18:46:54 +08:00
|
|
|
|
**kwargs,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
|
|
async def post(
|
|
|
|
|
|
cls,
|
|
|
|
|
|
url: str,
|
|
|
|
|
|
*,
|
|
|
|
|
|
data: Optional[Dict[str, str]] = None,
|
|
|
|
|
|
content: Any = None,
|
|
|
|
|
|
files: Any = None,
|
2022-06-14 18:47:43 +08:00
|
|
|
|
verify: bool = True,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
use_proxy: bool = True,
|
2023-03-18 01:39:31 +08:00
|
|
|
|
proxy: Optional[Dict[str, str]] = None,
|
|
|
|
|
|
json: Optional[Dict[str, Any]] = None,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
params: Optional[Dict[str, str]] = None,
|
|
|
|
|
|
headers: Optional[Dict[str, str]] = None,
|
|
|
|
|
|
cookies: Optional[Dict[str, str]] = None,
|
|
|
|
|
|
timeout: Optional[int] = 30,
|
2021-12-16 11:16:28 +08:00
|
|
|
|
**kwargs,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
) -> Response:
|
|
|
|
|
|
"""
|
2022-07-23 14:09:17 +08:00
|
|
|
|
说明:
|
2021-12-16 11:16:28 +08:00
|
|
|
|
Post
|
2022-07-23 14:09:17 +08:00
|
|
|
|
参数:
|
2021-12-16 11:16:28 +08:00
|
|
|
|
:param url: url
|
|
|
|
|
|
:param data: data
|
|
|
|
|
|
:param content: content
|
|
|
|
|
|
:param files: files
|
|
|
|
|
|
:param use_proxy: 是否默认代理
|
|
|
|
|
|
:param proxy: 指定代理
|
|
|
|
|
|
:param json: json
|
|
|
|
|
|
:param params: params
|
|
|
|
|
|
:param headers: 请求头
|
|
|
|
|
|
:param cookies: cookies
|
|
|
|
|
|
:param timeout: 超时时间
|
2021-11-23 21:44:59 +08:00
|
|
|
|
"""
|
|
|
|
|
|
if not headers:
|
|
|
|
|
|
headers = get_user_agent()
|
2023-03-18 01:39:31 +08:00
|
|
|
|
proxy_ = proxy if proxy else cls.proxy if use_proxy else None
|
|
|
|
|
|
async with httpx.AsyncClient(proxies=proxy_, verify=verify) as client:
|
2021-11-23 21:44:59 +08:00
|
|
|
|
return await client.post(
|
|
|
|
|
|
url,
|
|
|
|
|
|
content=content,
|
|
|
|
|
|
data=data,
|
|
|
|
|
|
files=files,
|
|
|
|
|
|
json=json,
|
|
|
|
|
|
params=params,
|
|
|
|
|
|
headers=headers,
|
|
|
|
|
|
cookies=cookies,
|
|
|
|
|
|
timeout=timeout,
|
2021-12-16 11:16:28 +08:00
|
|
|
|
**kwargs,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
|
|
async def download_file(
|
|
|
|
|
|
cls,
|
|
|
|
|
|
url: str,
|
|
|
|
|
|
path: Union[str, Path],
|
|
|
|
|
|
*,
|
|
|
|
|
|
params: Optional[Dict[str, str]] = None,
|
2022-06-14 18:47:43 +08:00
|
|
|
|
verify: bool = True,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
use_proxy: bool = True,
|
2023-03-18 01:39:31 +08:00
|
|
|
|
proxy: Optional[Dict[str, str]] = None,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
headers: Optional[Dict[str, str]] = None,
|
|
|
|
|
|
cookies: Optional[Dict[str, str]] = None,
|
|
|
|
|
|
timeout: Optional[int] = 30,
|
2022-05-25 00:56:41 +08:00
|
|
|
|
stream: bool = False,
|
2021-12-16 11:16:28 +08:00
|
|
|
|
**kwargs,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
) -> bool:
|
|
|
|
|
|
"""
|
2022-07-23 14:09:17 +08:00
|
|
|
|
说明:
|
2021-12-16 11:16:28 +08:00
|
|
|
|
下载文件
|
2022-07-23 14:09:17 +08:00
|
|
|
|
参数:
|
2021-12-16 11:16:28 +08:00
|
|
|
|
:param url: url
|
|
|
|
|
|
:param path: 存储路径
|
|
|
|
|
|
:param params: params
|
2022-07-03 22:00:49 +08:00
|
|
|
|
:param verify: verify
|
2021-12-16 11:16:28 +08:00
|
|
|
|
:param use_proxy: 使用代理
|
|
|
|
|
|
:param proxy: 指定代理
|
|
|
|
|
|
:param headers: 请求头
|
|
|
|
|
|
:param cookies: cookies
|
|
|
|
|
|
:param timeout: 超时时间
|
2022-05-25 00:56:41 +08:00
|
|
|
|
:param stream: 是否使用流式下载(流式写入+进度条,适用于下载大文件)
|
2021-11-23 21:44:59 +08:00
|
|
|
|
"""
|
|
|
|
|
|
if isinstance(path, str):
|
|
|
|
|
|
path = Path(path)
|
|
|
|
|
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
try:
|
|
|
|
|
|
for _ in range(3):
|
2022-05-25 00:56:41 +08:00
|
|
|
|
if not stream:
|
|
|
|
|
|
try:
|
|
|
|
|
|
content = (
|
|
|
|
|
|
await cls.get(
|
|
|
|
|
|
url,
|
|
|
|
|
|
params=params,
|
|
|
|
|
|
headers=headers,
|
|
|
|
|
|
cookies=cookies,
|
|
|
|
|
|
use_proxy=use_proxy,
|
|
|
|
|
|
proxy=proxy,
|
|
|
|
|
|
timeout=timeout,
|
|
|
|
|
|
**kwargs,
|
|
|
|
|
|
)
|
|
|
|
|
|
).content
|
|
|
|
|
|
async with aiofiles.open(path, "wb") as wf:
|
|
|
|
|
|
await wf.write(content)
|
|
|
|
|
|
logger.info(f"下载 {url} 成功.. Path:{path.absolute()}")
|
|
|
|
|
|
return True
|
|
|
|
|
|
except (TimeoutError, ConnectTimeout):
|
|
|
|
|
|
pass
|
|
|
|
|
|
else:
|
|
|
|
|
|
if not headers:
|
|
|
|
|
|
headers = get_user_agent()
|
2023-03-18 01:39:31 +08:00
|
|
|
|
proxy_ = proxy if proxy else cls.proxy if use_proxy else None
|
2022-05-25 00:56:41 +08:00
|
|
|
|
try:
|
2023-02-18 18:46:54 +08:00
|
|
|
|
async with httpx.AsyncClient(
|
2023-03-18 01:39:31 +08:00
|
|
|
|
proxies=proxy_, verify=verify
|
2023-02-18 18:46:54 +08:00
|
|
|
|
) as client:
|
2022-05-25 00:56:41 +08:00
|
|
|
|
async with client.stream(
|
|
|
|
|
|
"GET",
|
|
|
|
|
|
url,
|
|
|
|
|
|
params=params,
|
|
|
|
|
|
headers=headers,
|
|
|
|
|
|
cookies=cookies,
|
|
|
|
|
|
timeout=timeout,
|
2023-02-18 18:46:54 +08:00
|
|
|
|
**kwargs,
|
2022-05-25 00:56:41 +08:00
|
|
|
|
) as response:
|
2023-02-18 18:46:54 +08:00
|
|
|
|
logger.info(
|
|
|
|
|
|
f"开始下载 {path.name}.. Path: {path.absolute()}"
|
|
|
|
|
|
)
|
2022-05-25 00:56:41 +08:00
|
|
|
|
async with aiofiles.open(path, "wb") as wf:
|
|
|
|
|
|
total = int(response.headers["Content-Length"])
|
|
|
|
|
|
with rich.progress.Progress(
|
|
|
|
|
|
rich.progress.TextColumn(path.name),
|
|
|
|
|
|
"[progress.percentage]{task.percentage:>3.0f}%",
|
|
|
|
|
|
rich.progress.BarColumn(bar_width=None),
|
|
|
|
|
|
rich.progress.DownloadColumn(),
|
2023-02-18 18:46:54 +08:00
|
|
|
|
rich.progress.TransferSpeedColumn(),
|
2022-05-25 00:56:41 +08:00
|
|
|
|
) as progress:
|
2023-02-18 18:46:54 +08:00
|
|
|
|
download_task = progress.add_task(
|
|
|
|
|
|
"Download", total=total
|
|
|
|
|
|
)
|
2022-05-25 00:56:41 +08:00
|
|
|
|
async for chunk in response.aiter_bytes():
|
|
|
|
|
|
await wf.write(chunk)
|
|
|
|
|
|
await wf.flush()
|
2023-02-18 18:46:54 +08:00
|
|
|
|
progress.update(
|
|
|
|
|
|
download_task,
|
|
|
|
|
|
completed=response.num_bytes_downloaded,
|
|
|
|
|
|
)
|
2022-05-25 00:56:41 +08:00
|
|
|
|
logger.info(f"下载 {url} 成功.. Path:{path.absolute()}")
|
|
|
|
|
|
return True
|
|
|
|
|
|
except (TimeoutError, ConnectTimeout):
|
|
|
|
|
|
pass
|
2021-11-23 21:44:59 +08:00
|
|
|
|
else:
|
2021-12-16 11:16:28 +08:00
|
|
|
|
logger.error(f"下载 {url} 下载超时.. Path:{path.absolute()}")
|
2021-11-23 21:44:59 +08:00
|
|
|
|
except Exception as e:
|
2021-12-16 11:16:28 +08:00
|
|
|
|
logger.error(f"下载 {url} 未知错误 {type(e)}:{e}.. Path:{path.absolute()}")
|
2021-11-23 21:44:59 +08:00
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
|
|
async def gather_download_file(
|
|
|
|
|
|
cls,
|
|
|
|
|
|
url_list: List[str],
|
|
|
|
|
|
path_list: List[Union[str, Path]],
|
|
|
|
|
|
*,
|
|
|
|
|
|
limit_async_number: Optional[int] = None,
|
|
|
|
|
|
params: Optional[Dict[str, str]] = None,
|
|
|
|
|
|
use_proxy: bool = True,
|
2023-03-18 01:39:31 +08:00
|
|
|
|
proxy: Optional[Dict[str, str]] = None,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
headers: Optional[Dict[str, str]] = None,
|
|
|
|
|
|
cookies: Optional[Dict[str, str]] = None,
|
|
|
|
|
|
timeout: Optional[int] = 30,
|
2021-12-16 11:16:28 +08:00
|
|
|
|
**kwargs,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
) -> List[bool]:
|
|
|
|
|
|
"""
|
2022-07-23 14:09:17 +08:00
|
|
|
|
说明:
|
2021-12-16 11:16:28 +08:00
|
|
|
|
分组同时下载文件
|
2022-07-23 14:09:17 +08:00
|
|
|
|
参数:
|
2021-12-16 11:16:28 +08:00
|
|
|
|
:param url_list: url列表
|
|
|
|
|
|
:param path_list: 存储路径列表
|
|
|
|
|
|
:param limit_async_number: 限制同时请求数量
|
|
|
|
|
|
:param params: params
|
|
|
|
|
|
:param use_proxy: 使用代理
|
|
|
|
|
|
:param proxy: 指定代理
|
|
|
|
|
|
:param headers: 请求头
|
|
|
|
|
|
:param cookies: cookies
|
|
|
|
|
|
:param timeout: 超时时间
|
2021-11-23 21:44:59 +08:00
|
|
|
|
"""
|
|
|
|
|
|
if n := len(url_list) != len(path_list):
|
|
|
|
|
|
raise UrlPathNumberNotEqual(
|
|
|
|
|
|
f"Url数量与Path数量不对等,Url:{len(url_list)},Path:{len(path_list)}"
|
|
|
|
|
|
)
|
|
|
|
|
|
if limit_async_number and n > limit_async_number:
|
|
|
|
|
|
m = float(n) / limit_async_number
|
|
|
|
|
|
x = 0
|
|
|
|
|
|
j = limit_async_number
|
|
|
|
|
|
_split_url_list = []
|
|
|
|
|
|
_split_path_list = []
|
|
|
|
|
|
for _ in range(int(m)):
|
|
|
|
|
|
_split_url_list.append(url_list[x:j])
|
|
|
|
|
|
_split_path_list.append(path_list[x:j])
|
|
|
|
|
|
x += limit_async_number
|
|
|
|
|
|
j += limit_async_number
|
|
|
|
|
|
if int(m) < m:
|
|
|
|
|
|
_split_url_list.append(url_list[j:])
|
|
|
|
|
|
_split_path_list.append(path_list[j:])
|
|
|
|
|
|
else:
|
|
|
|
|
|
_split_url_list = [url_list]
|
|
|
|
|
|
_split_path_list = [path_list]
|
|
|
|
|
|
tasks = []
|
|
|
|
|
|
result_ = []
|
|
|
|
|
|
for x, y in zip(_split_url_list, _split_path_list):
|
|
|
|
|
|
for url, path in zip(x, y):
|
|
|
|
|
|
tasks.append(
|
|
|
|
|
|
asyncio.create_task(
|
|
|
|
|
|
cls.download_file(
|
|
|
|
|
|
url,
|
|
|
|
|
|
path,
|
|
|
|
|
|
params=params,
|
|
|
|
|
|
headers=headers,
|
|
|
|
|
|
cookies=cookies,
|
|
|
|
|
|
use_proxy=use_proxy,
|
|
|
|
|
|
timeout=timeout,
|
2021-12-16 11:16:28 +08:00
|
|
|
|
proxy=proxy,
|
2023-02-18 18:46:54 +08:00
|
|
|
|
**kwargs,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
)
|
|
|
|
|
|
)
|
|
|
|
|
|
)
|
|
|
|
|
|
_x = await asyncio.gather(*tasks)
|
|
|
|
|
|
result_ = result_ + list(_x)
|
|
|
|
|
|
tasks.clear()
|
|
|
|
|
|
return result_
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class AsyncPlaywright:
|
|
|
|
|
|
@classmethod
|
2023-02-18 18:46:54 +08:00
|
|
|
|
@asynccontextmanager
|
|
|
|
|
|
async def new_page(cls, **kwargs) -> AsyncGenerator[Page, None]:
|
2021-11-23 21:44:59 +08:00
|
|
|
|
"""
|
2022-07-23 14:09:17 +08:00
|
|
|
|
说明:
|
2021-12-16 11:16:28 +08:00
|
|
|
|
获取一个新页面
|
2022-07-23 14:09:17 +08:00
|
|
|
|
参数:
|
2021-12-16 11:16:28 +08:00
|
|
|
|
:param user_agent: 请求头
|
2021-11-23 21:44:59 +08:00
|
|
|
|
"""
|
2023-02-18 18:46:54 +08:00
|
|
|
|
browser = get_browser()
|
|
|
|
|
|
ctx = await browser.new_context(**kwargs)
|
|
|
|
|
|
page = await ctx.new_page()
|
2021-11-23 21:44:59 +08:00
|
|
|
|
try:
|
2023-02-18 18:46:54 +08:00
|
|
|
|
yield page
|
|
|
|
|
|
finally:
|
|
|
|
|
|
await page.close()
|
|
|
|
|
|
await ctx.close()
|
2021-11-23 21:44:59 +08:00
|
|
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
|
|
async def screenshot(
|
|
|
|
|
|
cls,
|
|
|
|
|
|
url: str,
|
|
|
|
|
|
path: Union[Path, str],
|
2022-02-19 18:20:19 +08:00
|
|
|
|
element: Union[str, List[str]],
|
2021-11-23 21:44:59 +08:00
|
|
|
|
*,
|
2022-04-04 20:33:37 +08:00
|
|
|
|
wait_time: Optional[int] = None,
|
2023-02-18 18:46:54 +08:00
|
|
|
|
viewport_size: Optional[Dict[str, int]] = None,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
wait_until: Optional[
|
|
|
|
|
|
Literal["domcontentloaded", "load", "networkidle"]
|
|
|
|
|
|
] = "networkidle",
|
2023-02-18 18:46:54 +08:00
|
|
|
|
timeout: Optional[float] = None,
|
|
|
|
|
|
type_: Optional[Literal["jpeg", "png"]] = None,
|
|
|
|
|
|
**kwargs,
|
2021-11-23 21:44:59 +08:00
|
|
|
|
) -> Optional[MessageSegment]:
|
|
|
|
|
|
"""
|
2022-07-23 14:09:17 +08:00
|
|
|
|
说明:
|
2021-12-16 11:16:28 +08:00
|
|
|
|
截图,该方法仅用于简单快捷截图,复杂截图请操作 page
|
2022-07-23 14:09:17 +08:00
|
|
|
|
参数:
|
2021-12-16 11:16:28 +08:00
|
|
|
|
:param url: 网址
|
|
|
|
|
|
:param path: 存储路径
|
|
|
|
|
|
:param element: 元素选择
|
2022-04-04 20:33:37 +08:00
|
|
|
|
:param wait_time: 等待截取超时时间
|
2021-12-16 11:16:28 +08:00
|
|
|
|
:param viewport_size: 窗口大小
|
|
|
|
|
|
:param wait_until: 等待类型
|
|
|
|
|
|
:param timeout: 超时限制
|
|
|
|
|
|
:param type_: 保存类型
|
2021-11-23 21:44:59 +08:00
|
|
|
|
"""
|
|
|
|
|
|
if viewport_size is None:
|
|
|
|
|
|
viewport_size = dict(width=2560, height=1080)
|
|
|
|
|
|
if isinstance(path, str):
|
|
|
|
|
|
path = Path(path)
|
2023-02-18 18:46:54 +08:00
|
|
|
|
wait_time = wait_time * 1000 if wait_time else None
|
|
|
|
|
|
if isinstance(element, str):
|
|
|
|
|
|
element_list = [element]
|
|
|
|
|
|
else:
|
|
|
|
|
|
element_list = element
|
|
|
|
|
|
async with cls.new_page(viewport=viewport_size) as page:
|
|
|
|
|
|
await page.goto(url, timeout=timeout, wait_until=wait_until)
|
|
|
|
|
|
card = page
|
|
|
|
|
|
for e in element_list:
|
|
|
|
|
|
if not card:
|
|
|
|
|
|
return None
|
|
|
|
|
|
card = await card.wait_for_selector(e, timeout=wait_time)
|
|
|
|
|
|
if card:
|
|
|
|
|
|
await card.screenshot(path=path, timeout=timeout, type=type_)
|
|
|
|
|
|
return image(path)
|
2021-11-23 21:44:59 +08:00
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class UrlPathNumberNotEqual(Exception):
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class BrowserIsNone(Exception):
|
|
|
|
|
|
pass
|