# zhenxun_bot/util/get_bilibili_img.py

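"""Scrape images from bilibili article search results.

Search-result pages are split into batches and handed to worker threads;
each worker opens the matched articles, collects the lazy-loaded <figure>
images, and saves them as sequentially numbered .jpg files under the given
path.
"""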
import requests
from util.user_agent import get_user_agent
from bs4 import BeautifulSoup
from time import sleep
import threading
import os
from configs.path_config import IMAGE_PATH

lock = threading.Lock()
url = "https://search.bilibili.com/article"
# path = IMAGE_PATH + "setu/"
index = 1
THREAD_SUM_REMAINDER = 2  # smaller value -> more threads


class bilibiliThread(threading.Thread):
    def __init__(self, threadId, url_list, path, nolist):
        threading.Thread.__init__(self)
        self.threadId = threadId
        self.url_list = url_list
        self.path = path
        self.nolist = nolist

    def run(self):
        print("Starting thread <><><><><><><><><> " + self.threadId)
        thread_get_url(self.threadId, self.url_list, self.path, self.nolist)


def get_bilibili_img(name, path, nolist=None):
    """Search bilibili articles for `name` and download every image found,
    saving them as sequentially numbered .jpg files under `path`."""
    global index
    index = get_dirfile_len(path)
    print("index ===", index)
    threadId = 1
    params = {
        'keyword': name,
        'page': '1'
    }
    res = requests.get(url, headers=get_user_agent(), params=params)
    sleep(8)
    soup = BeautifulSoup(res.text, 'html.parser')
    # print(soup.text)
    # The pagination widget appears in two markup variants; fall back to a
    # single page when neither is found.
    try:
        total_page = soup.find_all('button', {'class': 'pagination-btn'})[-1].text.strip()
        print("1 try")
    except IndexError:
        try:
            total_page = soup.find_all('button', {'class': 'pagination-btn num-btn'})[-1].text.strip()
            print("2 try")
        except IndexError:
            total_page = 1
            print("3 except")
    print(total_page)
    url_list = []
    for page in range(1, int(total_page) + 1):
        url_r = "https://search.bilibili.com/article?keyword=" + name + "&page=" + str(page)
        url_list.append(url_r)
        if page % THREAD_SUM_REMAINDER == 0:
            print('-----> ' + str(page) + " =======>", url_list)
            # _thread.start_new_thread(thread_get_url, (url_list, path,))
            bilibiliThread(str(threadId), url_list, path, nolist).start()
            threadId += 1
            sleep(0.5)
            url_list = []
    if url_list:
        print("========================= starting last thread ========================= url count: ", len(url_list))
        bilibiliThread(str(threadId), url_list, path, nolist).start()
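
# Dispatch note: search pages are grouped into batches of
# THREAD_SUM_REMAINDER URLs, one bilibiliThread per batch, with a final
# thread for any leftover pages (e.g. 11 pages at the default of 2 yields
# five full batches plus one remainder thread).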


def thread_get_url(threadId, url_list, path, nolist):
    for url in url_list:
        res = requests.get(url, headers=get_user_agent())
        sleep(2)
        soup = BeautifulSoup(res.text, 'lxml')
        alist = soup.find_all('a', {'class': 'poster'})
        img_content_page = []
        # print(alist)
        for a in alist:
            if nolist is not None:
                if a.get('href') not in nolist:
                    img_content_page.append("https://" + a.get('href')[2:])
            else:
                img_content_page.append("https://" + a.get('href')[2:])
        pic_url = []
        for img_content in img_content_page:
            print("Fetching ---------->", img_content)
            res = requests.get(img_content, headers=get_user_agent())
            sleep(2)
            soup = BeautifulSoup(res.text, 'lxml')
            figure_ls = soup.body.find_all('figure')
            # print(figure_ls)
            for figure in figure_ls:
                if figure.img is None:
                    continue
                try:
                    _ = figure.img.attrs['class']
                except KeyError:
                    # no class attribute -> an article picture; the real URL
                    # sits in the lazy-load data-src attribute
                    data_src = figure.img.attrs['data-src']
                    pic_url.append('https:' + data_src)
        print("Thread " + threadId + " fetch complete ------> saving")
        for url in pic_url:
            print("Thread " + threadId + " saving ---------------->", url)
            res = requests.get(url, headers=get_user_agent())
            save_img(res.content, path, threadId)
        pic_url = []
    print("Thread " + threadId + " ----------------> done")
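
# Parsing note: only class-less <img> tags inside <figure> elements are
# downloaded; treating the class attribute as an exclusion filter for
# styled/UI images is inferred from the markup, not documented upstream.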


def save_img(img, path, threadId):
    global index
    # Reserve a unique file index and bump the shared counter atomically so
    # concurrent threads never pick the same filename.
    with lock:
        img_index = index
        index += 1
    try:
        with open(path + str(img_index) + ".jpg", 'wb') as f:
            f.write(img)
    except OSError:
        print("Thread " + threadId + " save failed -------->" + str(img_index) + ".jpg")


def get_dirfile_len(path):
    return len(os.listdir(path))
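
# `index` is seeded from the number of files already in the target directory,
# so repeated runs continue numbering instead of overwriting earlier images
# (this assumes the directory exists and only holds images saved by this
# module).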


if __name__ == '__main__':
    # url = "https://search.bilibili.com" \
    #       "/article?keyword=%23%E4%BB%8A%E6%97%A5%E4%BB%BD%E7%9A%84%E5%8F%AF%E7%88%B1%" \
    #       "E5%B0%8F%E8%90%9D%E8%8E%89%EF%BC%8C%E8%BF%9B%E6%9D%A5%E7%9C%8B%E7%9C%8B%EF%BC%8C%E" \
    #       "6%8F%90%E7%A5%9E%E9%86%92%E8%84%91%EF%BC%81"
    # res = requests.get(url, headers=get_user_agent())
    # sleep(2)
    # soup = BeautifulSoup(res.text, 'lxml')
    # alist = soup.find_all('button', {'class': 'pagination-btn num-btn'})
    # total_page = soup.find_all('button', {'class': 'pagination-btn num-btn'})[-1].text.strip()
    # print(total_page)
    get_bilibili_img("精选动漫壁纸手机电脑壁纸&动漫游戏专题", IMAGE_PATH + "bizhi/")