From 9bc441951782e44709eabc4a98a7de6fe78df6d2 Mon Sep 17 00:00:00 2001 From: HibiKier <45528451+HibiKier@users.noreply.github.com> Date: Thu, 20 May 2021 20:51:47 +0800 Subject: [PATCH] Delete get_bilibili_img.py --- util/get_bilibili_img.py | 140 --------------------------------------- 1 file changed, 140 deletions(-) delete mode 100644 util/get_bilibili_img.py diff --git a/util/get_bilibili_img.py b/util/get_bilibili_img.py deleted file mode 100644 index 19348e0b..00000000 --- a/util/get_bilibili_img.py +++ /dev/null @@ -1,140 +0,0 @@ -import requests -from util.user_agent import get_user_agent -from bs4 import BeautifulSoup -from time import sleep -import threading -import os -from configs.path_config import IMAGE_PATH - -lock = threading.Lock() - -url = "https://search.bilibili.com/article" -# path = IMAGE_PATH + "setu/" - -index = 1 -THREAD_SUM_REMAINDER = 2 # 越小线程越多 - - -class bilibiliThread (threading.Thread): - def __init__(self, threadId, url_list, path, nolist): - threading.Thread.__init__(self) - self.threadId = threadId - self.url_list = url_list - self.path = path - self.nolist = nolist - def run(self): - print("开始线程<><><><><><><><><> " + self.threadId) - thread_get_url(self.threadId, self.url_list, self.path, self.nolist) - - -def get_bilibili_img(name, path, nolist=None): - global index - index = get_dirfile_len(path) - print("index===", index) - threadId = 1 - params = { - 'keyword': name, - 'page': '1' - } - res = requests.get(url, headers=get_user_agent(), params=params) - sleep(8) - soup = BeautifulSoup(res.text, 'html.parser') - # print(soup.text) - try: - total_page = soup.find_all('button', {'class': 'pagination-btn'})[-1].text.strip() - print("1 try") - except: - try: - total_page = soup.find_all('button', {'class': 'pagination-btn num-btn'})[-1].text.strip() - print("2 try") - except: - total_page = 1 - print("3 except") - print(total_page) - url_list = [] - for page in range(1, int(total_page)+1): - url_r = "https://search.bilibili.com/article?keyword=" + name + "&page=" + str(page) - url_list.append(url_r) - if page % THREAD_SUM_REMAINDER == 0: - print('-----> ' + str(page) + " =======>", url_list) - # _thread.start_new_thread(thread_get_url, (url_list, path,)) - bilibiliThread(str(threadId), url_list, path, nolist).start() - threadId += 1 - sleep(0.5) - url_list = [] - if url_list: - print("=========================最后一个线程启动========================= url数量: ", len(url_list)) - bilibiliThread(str(threadId), url_list, path, nolist).start() - - -def thread_get_url(threadId, url_list, path, nolist): - for url in url_list: - res = requests.get(url, headers=get_user_agent()) - sleep(2) - soup = BeautifulSoup(res.text, 'lxml') - alist = soup.find_all('a', {'class': 'poster'}) - img_content_page = [] - # print(alist) - for a in alist: - if nolist != None: - if a.get('href') not in nolist: - img_content_page.append("https://" + a.get('href')[2:]) - else: - img_content_page.append("https://" + a.get('href')[2:]) - pic_url = [] - for img_content in img_content_page: - print("开始获取---------->", img_content) - res = requests.get(img_content, headers=get_user_agent()) - sleep(2) - soup = BeautifulSoup(res.text, 'lxml') - figure_ls = soup.body.find_all('figure') - # print(figure_ls) - for figure in figure_ls: - try: - _ = figure.img.attrs['class'] - except: - data_src = figure.img.attrs['data-src'] - pic_url.append('https:' + data_src) - print("线程 " + threadId + " 获取完毕------> 开始存储") - for url in pic_url: - print("线程 " + threadId + "正在存储---------------->", url) - res = requests.get(url, headers=get_user_agent()) - save_img(res.content, path, threadId) - pic_url = [] - print("线程 " + threadId + " ---------------->执行完毕") - - -def save_img(img, path, threadId): - global index - try: - lock.acquire() - img_index = index - finally: - lock.release() - try: - with open(path + str(img_index) + ".jpg", 'wb') as f: - f.write(img) - lock.acquire() - index += 1 - except: - print("线程 " + threadId + "存储失败-------->" + str(img_index) + ".jpg") - finally: - lock.release() - - -def get_dirfile_len(path): - return len(os.listdir(path)) - - -if __name__ == '__main__': - # url = "https://search.bilibili.com" \ - # "/article?keyword=%23%E4%BB%8A%E6%97%A5%E4%BB%BD%E7%9A%84%E5%8F%AF%E7%88%B1%" \ - # "E5%B0%8F%E8%90%9D%E8%8E%89%EF%BC%8C%E8%BF%9B%E6%9D%A5%E7%9C%8B%E7%9C%8B%EF%BC%8C%E" \ - # "6%8F%90%E7%A5%9E%E9%86%92%E8%84%91%EF%BC%81" - # res = requests.get(url, headers=get_user_agent()) - # sleep(2) - # soup = BeautifulSoup(res.text, 'lxml') - # alist = soup.find_all('button', {'class': 'pagination-btn num-btn'}) - # total_page = soup.find_all('button', {'class': 'pagination-btn num-btn'})[-1].text.strip() - # print(total_page) - get_bilibili_img("精选动漫壁纸手机电脑壁纸&动漫游戏专题", IMAGE_PATH + "bizhi/") \ No newline at end of file