Delete get_bilibili_img.py
This commit is contained in:
parent
a85290e7b0
commit
9bc4419517
@@ -1,140 +0,0 @@
import requests
from util.user_agent import get_user_agent
from bs4 import BeautifulSoup
from time import sleep
import threading
import os
from configs.path_config import IMAGE_PATH

lock = threading.Lock()

url = "https://search.bilibili.com/article"
# path = IMAGE_PATH + "setu/"

index = 1
THREAD_SUM_REMAINDER = 2  # the smaller the value, the more threads

class BilibiliThread(threading.Thread):
    def __init__(self, threadId, url_list, path, nolist):
        threading.Thread.__init__(self)
        self.threadId = threadId
        self.url_list = url_list
        self.path = path
        self.nolist = nolist

    def run(self):
        print("Starting thread <><><><><><><><><> " + self.threadId)
        thread_get_url(self.threadId, self.url_list, self.path, self.nolist)

def get_bilibili_img(name, path, nolist=None):
    global index
    index = get_dirfile_len(path)
    print("index===", index)
    threadId = 1
    params = {
        'keyword': name,
        'page': '1'
    }
    res = requests.get(url, headers=get_user_agent(), params=params)
    sleep(8)
    soup = BeautifulSoup(res.text, 'html.parser')
    # Read the total page count from the pagination buttons; the search page
    # has used two different button classes, so try both before falling back
    # to a single page.
    try:
        total_page = soup.find_all('button', {'class': 'pagination-btn'})[-1].text.strip()
    except (IndexError, AttributeError):
        try:
            total_page = soup.find_all('button', {'class': 'pagination-btn num-btn'})[-1].text.strip()
        except (IndexError, AttributeError):
            total_page = 1
    print(total_page)
    url_list = []
    for page in range(1, int(total_page) + 1):
        url_r = "https://search.bilibili.com/article?keyword=" + name + "&page=" + str(page)
        url_list.append(url_r)
        # Hand a batch of result pages to a new worker thread every
        # THREAD_SUM_REMAINDER pages.
        if page % THREAD_SUM_REMAINDER == 0:
            print('-----> ' + str(page) + " =======>", url_list)
            BilibiliThread(str(threadId), url_list, path, nolist).start()
            threadId += 1
            sleep(0.5)
            url_list = []
    # Whatever pages are left over go to one final thread.
    if url_list:
        print("========================= starting final thread ========================= URL count:", len(url_list))
        BilibiliThread(str(threadId), url_list, path, nolist).start()
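
# Aside, not part of the original file: the batching above amounts to
# chunking the page URLs into fixed-size groups, one group per worker
# thread. A minimal standalone sketch of that chunking (the helper name
# chunk_pages is made up for illustration):

def chunk_pages(pages, size):
    # Yield consecutive slices of `pages`, `size` items at a time.
    for i in range(0, len(pages), size):
        yield pages[i:i + size]

# e.g. list(chunk_pages(['p1', 'p2', 'p3', 'p4', 'p5'], 2))
#      -> [['p1', 'p2'], ['p3', 'p4'], ['p5']]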

def thread_get_url(threadId, url_list, path, nolist):
    for url in url_list:
        res = requests.get(url, headers=get_user_agent())
        sleep(2)
        soup = BeautifulSoup(res.text, 'lxml')
        alist = soup.find_all('a', {'class': 'poster'})
        img_content_page = []
        # Collect article links from the result page, skipping any that are
        # already recorded in nolist.
        for a in alist:
            if nolist is not None:
                if a.get('href') not in nolist:
                    img_content_page.append("https://" + a.get('href')[2:])
            else:
                img_content_page.append("https://" + a.get('href')[2:])
        pic_url = []
        for img_content in img_content_page:
            print("Fetching ---------->", img_content)
            res = requests.get(img_content, headers=get_user_agent())
            sleep(2)
            soup = BeautifulSoup(res.text, 'lxml')
            figure_ls = soup.body.find_all('figure')
            # Article images are lazy-loaded: the real URL sits in data-src
            # on <img> tags that carry no class attribute, so keep only those.
            for figure in figure_ls:
                if figure.img is not None and 'class' not in figure.img.attrs:
                    pic_url.append('https:' + figure.img.attrs['data-src'])
        print("Thread " + threadId + " finished fetching ------> saving")
        for pic in pic_url:
            print("Thread " + threadId + " saving ---------------->", pic)
            res = requests.get(pic, headers=get_user_agent())
            save_img(res.content, path, threadId)
    print("Thread " + threadId + " ----------------> done")
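
# Aside, not part of the original file: the <figure> filter above can be
# checked in isolation against made-up HTML; only the lazy-loaded <img>
# without a class attribute survives:

_sample = """
<body>
  <figure><img class="loaded" src="x.jpg"></figure>
  <figure><img data-src="//i0.hdslb.com/example.jpg"></figure>
</body>
"""
_soup = BeautifulSoup(_sample, 'html.parser')
for _figure in _soup.body.find_all('figure'):
    if _figure.img is not None and 'class' not in _figure.img.attrs:
        print('https:' + _figure.img.attrs['data-src'])
# prints: https://i0.hdslb.com/example.jpg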

def save_img(img, path, threadId):
    global index
    # Reserve a unique file index while holding the lock, so concurrent
    # threads never write to the same filename.
    with lock:
        img_index = index
        index += 1
    try:
        with open(path + str(img_index) + ".jpg", 'wb') as f:
            f.write(img)
    except OSError:
        print("Thread " + threadId + " failed to save -------->" + str(img_index) + ".jpg")


def get_dirfile_len(path):
    # Seed the filename counter from the number of files already on disk.
    return len(os.listdir(path))
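
# Aside, not part of the original file: a minimal check (all names made up)
# that the locked counter reservation used in save_img hands out distinct
# indices under concurrency:

def _reserve():
    global index
    with lock:
        i = index
        index += 1
    return i

def _check_counter(workers=4, per_worker=1000):
    seen = []
    def _worker():
        for _ in range(per_worker):
            seen.append(_reserve())
    threads = [threading.Thread(target=_worker) for _ in range(workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    assert len(set(seen)) == workers * per_worker  # no index handed out twice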

if __name__ == '__main__':
    # url = "https://search.bilibili.com" \
    #       "/article?keyword=%23%E4%BB%8A%E6%97%A5%E4%BB%BD%E7%9A%84%E5%8F%AF%E7%88%B1%" \
    #       "E5%B0%8F%E8%90%9D%E8%8E%89%EF%BC%8C%E8%BF%9B%E6%9D%A5%E7%9C%8B%E7%9C%8B%EF%BC%8C%E" \
    #       "6%8F%90%E7%A5%9E%E9%86%92%E8%84%91%EF%BC%81"
    # res = requests.get(url, headers=get_user_agent())
    # sleep(2)
    # soup = BeautifulSoup(res.text, 'lxml')
    # total_page = soup.find_all('button', {'class': 'pagination-btn num-btn'})[-1].text.strip()
    # print(total_page)
    get_bilibili_img("精选动漫壁纸手机电脑壁纸&动漫游戏专题", IMAGE_PATH + "bizhi/")
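
# Aside, not part of the original file: a sketch of the same page fan-out
# driven by concurrent.futures instead of a threading.Thread subclass. It
# reuses the module's thread_get_url and THREAD_SUM_REMAINDER plus the
# chunk_pages helper sketched earlier; get_bilibili_img_pooled is a made-up
# name, and total_page is assumed to be discovered as in get_bilibili_img.

from concurrent.futures import ThreadPoolExecutor

def get_bilibili_img_pooled(name, path, total_page, nolist=None, workers=4):
    # Build the per-page search URLs exactly as get_bilibili_img does.
    pages = [
        "https://search.bilibili.com/article?keyword=" + name + "&page=" + str(p)
        for p in range(1, int(total_page) + 1)
    ]
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for i, batch in enumerate(chunk_pages(pages, THREAD_SUM_REMAINDER), start=1):
            pool.submit(thread_get_url, str(i), batch, path, nolist)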