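"""Multithreaded image crawler for bilibili article search results.

Searches https://search.bilibili.com/article for a keyword, splits the
result pages into batches, spawns one worker thread per batch, and saves
every article image as sequentially numbered .jpg files under the given
directory.
"""
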
import os
import threading
from time import sleep

import requests
from bs4 import BeautifulSoup

from configs.path_config import IMAGE_PATH
from util.user_agent import get_user_agent

lock = threading.Lock()

url = "https://search.bilibili.com/article"
# path = IMAGE_PATH + "setu/"

index = 1
THREAD_SUM_REMAINDER = 2  # result pages per worker thread; smaller -> more threads
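# Example of the batching: with THREAD_SUM_REMAINDER = 2, a 5-page search
# spawns workers for pages [1, 2] and [3, 4], plus one final worker for the
# leftover [5] (see the tail of get_bilibili_img).
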
class bilibiliThread(threading.Thread):
    """Worker that downloads all images from its batch of search-result pages."""

    def __init__(self, threadId, url_list, path, nolist):
        threading.Thread.__init__(self)
        self.threadId = threadId
        self.url_list = url_list
        self.path = path
        self.nolist = nolist

    def run(self):
        print("Starting thread <><><><><><><><><> " + self.threadId)
        thread_get_url(self.threadId, self.url_list, self.path, self.nolist)

def get_bilibili_img(name, path, nolist=None):
    global index
    # Resume numbering after whatever is already saved in the target directory.
    index = get_dirfile_len(path)
    print("index===", index)
    threadId = 1
    params = {
        'keyword': name,
        'page': '1'
    }
    res = requests.get(url, headers=get_user_agent(), params=params)
    sleep(8)
    soup = BeautifulSoup(res.text, 'html.parser')
    # print(soup.text)
    # bilibili has used more than one pagination markup; try both selectors
    # before falling back to a single page.
    try:
        total_page = soup.find_all('button', {'class': 'pagination-btn'})[-1].text.strip()
        print("1 try")
    except IndexError:
        try:
            total_page = soup.find_all('button', {'class': 'pagination-btn num-btn'})[-1].text.strip()
            print("2 try")
        except IndexError:
            total_page = 1
            print("3 except")
    print(total_page)
    url_list = []
    for page in range(1, int(total_page) + 1):
        url_r = "https://search.bilibili.com/article?keyword=" + name + "&page=" + str(page)
        url_list.append(url_r)
        # Hand each batch of THREAD_SUM_REMAINDER result pages to its own worker.
        if page % THREAD_SUM_REMAINDER == 0:
            print('-----> ' + str(page) + " =======>", url_list)
            # _thread.start_new_thread(thread_get_url, (url_list, path,))
            bilibiliThread(str(threadId), url_list, path, nolist).start()
            threadId += 1
            sleep(0.5)
            url_list = []
    # Any pages left over after the last full batch get one final worker.
    if url_list:
        print("========================= starting last thread ========================= url count: ", len(url_list))
        bilibiliThread(str(threadId), url_list, path, nolist).start()

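# A minimal usage sketch (keyword and nolist entry are hypothetical): nolist
# entries are matched against the raw protocol-relative hrefs scraped from
# the search page, e.g.
#   get_bilibili_img("壁纸", IMAGE_PATH + "bizhi/",
#                    nolist=["//www.bilibili.com/read/cv123456"])
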
def thread_get_url(threadId, url_list, path, nolist):
    for url in url_list:
        res = requests.get(url, headers=get_user_agent())
        sleep(2)
        soup = BeautifulSoup(res.text, 'lxml')
        alist = soup.find_all('a', {'class': 'poster'})
        img_content_page = []
        # print(alist)
        # Collect article links, skipping any href already recorded in nolist.
        for a in alist:
            if nolist is not None:
                if a.get('href') not in nolist:
                    img_content_page.append("https://" + a.get('href')[2:])
            else:
                img_content_page.append("https://" + a.get('href')[2:])
        pic_url = []
        for img_content in img_content_page:
            print("Fetching ---------->", img_content)
            res = requests.get(img_content, headers=get_user_agent())
            sleep(2)
            soup = BeautifulSoup(res.text, 'lxml')
            figure_ls = soup.body.find_all('figure')
            # print(figure_ls)
            # Article images carry no class on their <img> tag; decorated
            # placeholders do, so keep only the class-less ones.
            for figure in figure_ls:
                if figure.img is not None and 'class' not in figure.img.attrs:
                    data_src = figure.img.attrs.get('data-src')
                    if data_src:
                        pic_url.append('https:' + data_src)
        print("Thread " + threadId + " finished fetching ------> start saving")
        for url in pic_url:
            print("Thread " + threadId + " saving ---------------->", url)
            res = requests.get(url, headers=get_user_agent())
            save_img(res.content, path, threadId)
        pic_url = []
        print("Thread " + threadId + " ----------------> done")

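# Note: the sleep() calls above space requests out so the crawler does not
# hammer bilibili; tune those delays rather than deleting them if you need
# more speed.
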
def save_img(img, path, threadId):
    global index
    # Reserve the next filename index while holding the lock so concurrent
    # threads never write to the same file.
    with lock:
        img_index = index
        index += 1
    try:
        with open(path + str(img_index) + ".jpg", 'wb') as f:
            f.write(img)
    except OSError:
        print("Thread " + threadId + " failed to save --------> " + str(img_index) + ".jpg")


def get_dirfile_len(path):
    # Entry count of the directory; the numbering assumes it only ever holds
    # the downloaded .jpg files.
    return len(os.listdir(path))

if __name__ == '__main__':
    # url = "https://search.bilibili.com" \
    #       "/article?keyword=%23%E4%BB%8A%E6%97%A5%E4%BB%BD%E7%9A%84%E5%8F%AF%E7%88%B1%" \
    #       "E5%B0%8F%E8%90%9D%E8%8E%89%EF%BC%8C%E8%BF%9B%E6%9D%A5%E7%9C%8B%E7%9C%8B%EF%BC%8C%E" \
    #       "6%8F%90%E7%A5%9E%E9%86%92%E8%84%91%EF%BC%81"
    # res = requests.get(url, headers=get_user_agent())
    # sleep(2)
    # soup = BeautifulSoup(res.text, 'lxml')
    # alist = soup.find_all('button', {'class': 'pagination-btn num-btn'})
    # total_page = soup.find_all('button', {'class': 'pagination-btn num-btn'})[-1].text.strip()
    # print(total_page)
    get_bilibili_img("精选动漫壁纸手机电脑壁纸&动漫游戏专题", IMAGE_PATH + "bizhi/")