# zhenxun_bot/plugins/update_setu/data_source.py
#
# 115 lines
# 6.0 KiB
# Python
# Raw Normal View History
#
# 2021-05-20 19:25:51 +08:00
from configs.path_config import IMAGE_PATH, TXT_PATH
import os
from util.user_agent import get_user_agent
from services.log import logger
from datetime import datetime
from util.img_utils import rar_imgs, get_img_hash
from util.utils import get_bot, get_local_proxy
from asyncio.exceptions import TimeoutError
import aiofiles
import aiohttp
# 2021-06-15 10:57:08 +08:00
from aiohttp.client_exceptions import ClientConnectorError
# 2021-05-20 19:25:51 +08:00
try:
import ujson as json
except ModuleNotFoundError:
import json
async def update_setu_img():
async with aiohttp.ClientSession(headers=get_user_agent()) as session:
2021-06-15 10:57:08 +08:00
for file_name in ['setu_url.json', 'setu_r18_url.json']:
if file_name == 'setu_url.json':
json_name = 'setu_data.json'
path = '_setu/'
rar_path = 'setu_rar/'
2021-05-20 19:25:51 +08:00
else:
2021-06-15 10:57:08 +08:00
json_name = 'r18_setu_data.json'
path = '_r18/'
rar_path = 'r18_rar/'
if not os.path.exists(IMAGE_PATH + path):
os.mkdir(IMAGE_PATH + path)
if not os.path.exists(IMAGE_PATH + rar_path):
os.mkdir(IMAGE_PATH + rar_path)
2021-05-20 19:25:51 +08:00
try:
2021-06-15 10:57:08 +08:00
data = json.load(open(TXT_PATH + json_name, encoding='utf8'))
2021-05-20 19:25:51 +08:00
if not data:
2021-06-15 10:57:08 +08:00
data = {}
2021-05-20 19:25:51 +08:00
except (FileNotFoundError, TypeError):
2021-06-15 10:57:08 +08:00
data = {}
2021-05-20 19:25:51 +08:00
_success = 0
_similar = 0
try:
2021-06-15 10:57:08 +08:00
with open(TXT_PATH + file_name, 'r', encoding='utf8') as f:
txt_data = json.load(f)
2021-05-20 19:25:51 +08:00
if not txt_data:
continue
2021-06-15 10:57:08 +08:00
except (FileNotFoundError, ValueError):
2021-05-20 19:25:51 +08:00
continue
2021-06-15 10:57:08 +08:00
total = len(txt_data)
urls = [data[x]['img_url'] for x in data.keys()]
for pid in txt_data.keys():
2021-05-20 19:25:51 +08:00
index = str(len(os.listdir(IMAGE_PATH + path)))
2021-06-15 10:57:08 +08:00
url = txt_data[pid]["img_url"].replace('img-master', 'img-original').replace('_master1200', '')
if url in urls or txt_data[pid]["img_url"] in urls:
continue
2021-05-20 19:25:51 +08:00
logger.info(f'开始更新 index:{index} --> {url}')
for _ in range(3):
try:
async with session.get(url, proxy=get_local_proxy(), timeout=15) as response:
if response.status == 200:
2021-06-15 10:57:08 +08:00
async with aiofiles.open(IMAGE_PATH + rar_path + index + ".jpg", 'wb') as f:
2021-05-20 19:25:51 +08:00
await f.write(await response.read())
_success += 1
else:
2021-06-15 10:57:08 +08:00
logger.info(f'{url} 不存在使用更新原url')
url = txt_data[pid]["img_url"]
async with session.get(txt_data[pid]["img_url"], proxy=get_local_proxy(),
timeout=15) as response:
if response.status == 200:
async with aiofiles.open(IMAGE_PATH + rar_path + index + ".jpg", 'wb') as f:
await f.write(await response.read())
_success += 1
if os.path.getsize(IMAGE_PATH + rar_path + str(index) + ".jpg") > 1024 * 1024 * 1.5:
2021-05-20 19:25:51 +08:00
rar_imgs(
2021-06-15 10:57:08 +08:00
rar_path,
2021-05-20 19:25:51 +08:00
path,
in_file_name=index,
out_file_name=index
)
else:
2021-06-15 10:57:08 +08:00
logger.info('不需要压缩,移动图片 ' + IMAGE_PATH + rar_path + index + ".jpg --> "
2021-05-20 19:25:51 +08:00
+ IMAGE_PATH + path + index + ".jpg")
2021-06-15 10:57:08 +08:00
os.rename(IMAGE_PATH + rar_path + index + ".jpg",
2021-05-20 19:25:51 +08:00
IMAGE_PATH + path + index + ".jpg")
img_hash = str(get_img_hash(f'{IMAGE_PATH}{path}{index}.jpg'))
2021-06-15 10:57:08 +08:00
if img_hash in [data[x]['img_hash'] for x in data.keys()]:
2021-05-20 19:25:51 +08:00
logger.info(f'index:{index}'
2021-06-15 10:57:08 +08:00
f'{[data[x]["img_hash"] for x in data.keys()].index(img_hash)} 存在重复,删除')
2021-05-20 19:25:51 +08:00
os.remove(IMAGE_PATH + path + index + ".jpg")
_similar += 1
2021-06-15 10:57:08 +08:00
else:
data[index] = {
'title': txt_data[pid]['title'],
'author': txt_data[pid]['author'],
'pid': txt_data[pid]['pid'],
'img_hash': img_hash,
'img_url': url,
'tags': txt_data[pid]['tags'],
}
2021-05-20 19:25:51 +08:00
break
2021-06-15 10:57:08 +08:00
except (TimeoutError, ClientConnectorError) as e:
logger.warning(f'{url} 更新失败 ..{type(e)}{e}')
2021-05-20 19:25:51 +08:00
continue
with open(TXT_PATH + json_name, 'w', encoding='utf-8') as f:
2021-06-15 10:57:08 +08:00
json.dump(data, f, indent=4, ensure_ascii=False)
2021-05-20 19:25:51 +08:00
open(TXT_PATH + file_name, 'w')
logger.info(
2021-06-15 10:57:08 +08:00
f'{str(datetime.now()).split(".")[0]} 更新 {file_name.split(".")[0]}完成,预计更新 {total} 张,'
f'实际更新 {_success} 张,相似 {_similar} 张,实际存入 {_success - _similar}')
2021-05-20 19:25:51 +08:00
await get_bot().send_private_msg(
2021-06-16 18:41:31 +08:00
user_id=int(list(get_bot().config.superusers)[0]),
2021-06-15 10:57:08 +08:00
message=f'{str(datetime.now()).split(".")[0]} 更新{file_name.split(".")[0]}完成,预计更新 {total} 张,'
f'实际更新 {_success} 张,相似 {_similar} 张,实际存入 {_success - _similar}'
2021-05-20 19:25:51 +08:00
)