diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..35410ca
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# 默认忽略的文件
+/shelf/
+/workspace.xml
+# 基于编辑器的 HTTP 客户端请求
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/main.py b/main.py
index 76ac9ce..c768464 100644
--- a/main.py
+++ b/main.py
@@ -3,15 +3,26 @@ from selenium.webdriver.support.ui import WebDriverWait
 import time
 import run
 
+download_dir = r'E:\Repository\python-selenium-spider\download'
+
+proxies = {
+    "http": "socks5://127.0.0.1:1080",
+    "https": "socks5://127.0.0.1:1080",
+}
+
 if __name__ == '__main__':
     options = webdriver.ChromeOptions()
     options.add_argument('lang=zh-CN')
     options.add_argument(
         'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36')
-    options.add_argument(r'--user-data-dir=E:\Repository\skcks.cn\python-selenium-spider\tmp\UserData')
+    options.add_argument(r'--user-data-dir=E:\Repository\python-selenium-spider\tmp\UserData')
     options.add_argument('--disable-gpu')  # 如果不加这个选项,有时定位会出现问题
     # options.add_argument('--headless')  # 增加无界面选项
+    options.add_experimental_option("prefs", {
+        "download.default_directory": download_dir
+    })
+
     service = webdriver.ChromeService(r"./driver/chromedriver.exe")
     driver = webdriver.Chrome(service=service,options=options)
     # driver.maximize_window()
 
@@ -29,7 +40,8 @@ if __name__ == '__main__':
             "source": f.read()
         })
 
-        run.main(driver=driver)
-        time.sleep(5)
+        run.main(driver=driver, download_dir=download_dir, proxies=proxies)
+        time.sleep(86400)
     finally:
-        driver.quit()
\ No newline at end of file
+        pass
+        # driver.quit()
diff --git a/requirements.txt b/requirements.txt
index c3da3bd..25e970e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
-Pillow==10.2.0
-selenium==4.18.1
+Pillow==10.3.0
+selenium==4.20.0
diff --git a/run/__init__.py b/run/__init__.py
index 66a5744..20bb90e
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import time
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import requests
import os
from concurrent.futures import ThreadPoolExecutor

# Maximum number of concurrent image downloads.  os.cpu_count() may return
# None on some platforms, which ThreadPoolExecutor rejects — fall back to 4.
workers = os.cpu_count() or 4


def find_link(driver: WebDriver, url: str):
    """Open a fanbox post-list page and return the href of every post card.

    :param driver: a live Chrome WebDriver (already logged in).
    :param url: a ``.../posts?page=N`` listing URL.
    :return: list of absolute post URLs found on the page.
    """
    driver.get(url)
    time.sleep(1)  # crude wait for the client-side rendering to settle
    cards = driver.find_element(By.XPATH, "//div[contains(@class,'CreatorPostList__CardsWrapper')]")
    # Leading "." scopes the search to `cards`; a bare "//" on an element
    # still searches the whole document, which only worked by accident.
    items = cards.find_elements(By.XPATH, ".//a[contains(@class,'CardPostItem__Wrapper')]")

    links = []
    for item in items:
        href = item.get_attribute("href")
        print(href)
        links.append(href)

    return links


def has_next(driver: WebDriver):
    """Return True when the current post-list page shows a 'next page' arrow."""
    try:
        # find_element raises when the element is absent, so reaching this
        # return means the arrow exists.
        return driver.find_element(By.XPATH, "//pixiv-icon[@name='24/Next']") is not None
    except NoSuchElementException:
        return False


def download_file_link(driver: WebDriver, links):
    """Visit each post page and click its attached-file link, if present.

    Relies on Chrome's configured default download directory to receive the
    file.  Posts without a FileContent block are skipped.

    :param driver: a live Chrome WebDriver.
    :param links: iterable of post URLs.
    """
    for link in links:
        print(link)
        driver.get(link)
        time.sleep(1)  # wait for the page to render
        try:
            wrapper = driver.find_element(By.XPATH, "//div[contains(@class, 'FileContent__Wrapper')]")
            print(wrapper)
            anchor = wrapper.find_element(By.TAG_NAME, "a")
            anchor.click()
        except NoSuchElementException:
            # Post carries no downloadable file attachment — nothing to click.
            pass
        time.sleep(5)  # give the browser time to start/finish the download


def download_image(url: str, download_dir: str, proxies):
    """Download a single image with requests, bypassing the browser.

    The Referer/Cookie headers mimic the logged-in browser session because
    the fanbox CDN rejects anonymous requests.

    NOTE(review): the hard-coded session cookie below is a secret and will
    expire — it should be loaded from config/env, not live in source control.

    :param url: direct image URL.
    :param download_dir: directory the image is saved into.
    :param proxies: requests-style proxies mapping.
    :raises requests.HTTPError: when the server answers with an error status.
    """
    print(f"开始下载:{url}")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
        "Referer": "https://monpetit17.fanbox.cc/",
        "Cookie": "p_ab_id=0; p_ab_id_2=1; p_ab_d_id=1743506050; _gcl_au=1.1.1973053698.1715103653; cf_clearance=9nglHcFCr9D17iz8392jJQIJi4oN8TSpBsvlS4oPjvs-1715103654-1.0.1.1-DLbaQVWB8O8lyUZyJ5P8ToQD7Bx7dt5r_7KUz9aFqIcEM5GiAunuXwAQRu5BJ3c3zBOt242Oy13YvXz1omV5Dw; FANBOXSESSID=35206256_01xWtpm33E6tjKYTAdLoVaZq5xceRVNB; privacy_policy_agreement=6; privacy_policy_notification=0; _gid=GA1.2.1663028388.1715442061; __cf_bm=YRkVSwzNtFz96eLrGB3KhENnEVv7lkYdc898q2AF3G0-1715447846-1.0.1.1-yJ.D2R_c.jS8SErn4fAmIg6fShYSfc2h_m4vQrehVA5UpRV7rIsjnYCGTZLro7JW9nh1r0Hu853rOcvoy6hSrA; cf_clearance=QQWAR7NhgwYZjtKExO9v0IE2eHNkQweSMLRVugqL3mE-1715447848-1.0.1.1-JhfPisIW0GNy135ks_mIObi9.X.FmmorhRl_Fows5nrHEQuPBt2S7CY_lnB4vCipSp4Xq.QrKW5oscwSCMk_Hw; _gat_gtag_UA_1830249_145=1; _ga_D9TLP3EFER=GS1.1.1715442066.2.1.1715448098.22.0.0; _ga=GA1.1.530858074.1715103653"
    }
    save_path = os.path.join(download_dir, url.split("/")[-1])
    # timeout + raise_for_status: a hung connection or HTTP error must not
    # silently produce an empty/corrupt file on disk.
    req = requests.get(url, proxies=proxies, headers=headers, timeout=60)
    req.raise_for_status()
    with open(save_path, "wb") as f:
        f.write(req.content)
    print(f"下载结束 {url}, 保存路径:{save_path}")


def download_images(driver: WebDriver, links, download_dir: str, proxies):
    """Collect every inline image URL from each post and download them
    concurrently with a thread pool (one sub-directory per post id).

    :param driver: a live Chrome WebDriver.
    :param links: iterable of post URLs (e.g. .../posts/5183527).
    :param download_dir: root directory for downloads.
    :param proxies: requests-style proxies mapping, forwarded to download_image.
    """
    print(f"最大并发下载数: {workers}")
    futures = []
    with ThreadPoolExecutor(max_workers=workers) as worker:
        img_list = []
        for link in links:
            part_img_list = []

            print(link)
            # post page: https://monpetit17.fanbox.cc/posts/5183527
            # JSON API:  https://api.fanbox.cc/post.info?postId=5183527
            driver.get(link)
            print("打开页面")

            time.sleep(1)
            # Posts whose content is a downloadable file (not inline images)
            # are handled by download_file_link() — skip them here.
            try:
                driver.find_element(By.XPATH, "//div[contains(@class, 'FileContent__Wrapper')]")
                continue
            except NoSuchElementException:
                pass

            # Scroll down in steps so lazily-loaded images enter the DOM.
            for _ in range(10):
                driver.execute_script("window.scrollBy(0, 500)")
                time.sleep(0.2)

            img_link_elements = driver.find_elements(By.XPATH, "//a[contains(@class, 'PostImage__Anchor')]")

            # One sub-directory per post, keyed by the numeric post id.
            sub_dir = link.split("/")[-1]
            real_download_dir = os.path.join(download_dir, sub_dir)
            os.makedirs(real_download_dir, exist_ok=True)

            for element in img_link_elements:
                href = element.get_attribute("href")
                print(href)
                part_img_list.append(href)
                # Keep the future so download errors can be surfaced later.
                futures.append(worker.submit(download_image, href, real_download_dir, proxies))

            print("获取 %d 个图片地址" % len(part_img_list))
            print(part_img_list)
            img_list += part_img_list

        print("共 %d 个页面, 共计 %d 个图片" % (len(links), len(img_list)))

    # The pool is drained at this point; report any download that failed —
    # a bare submit() would swallow exceptions raised inside worker threads.
    for future in futures:
        try:
            future.result()
        except Exception as e:
            print(e)


def main(driver: WebDriver, download_dir: str, proxies):
    """Entry point: walk every post-list page of the creator, collect all
    post links, then download the inline images of every post.

    :param driver: a live, logged-in Chrome WebDriver.
    :param download_dir: root directory for downloads.
    :param proxies: requests-style proxies mapping.
    """
    driver.set_window_size(1920, 1080)

    links = []

    start_page = 1
    url = "https://monpetit17.fanbox.cc/posts?page=%d" % start_page
    links += find_link(driver, url)

    # Follow the pagination arrow until the last listing page.
    while has_next(driver):
        start_page += 1
        url = "https://monpetit17.fanbox.cc/posts?page=%d" % start_page
        links += find_link(driver, url)

    print("获取子页面链接", links)

    # Alternative mode: fetch file attachments via the browser instead.
    # download_file_link(driver, links)
    download_images(driver, links, download_dir, proxies)