批量爬取 fanbox

This commit is contained in:
shikong 2024-05-12 02:52:40 +08:00
parent 4accaec286
commit e385e9845d

View File

@ -47,17 +47,7 @@ def has_next(driver: WebDriver):
return has_next return has_next
def download_file_link(driver: WebDriver, links): def download_file_link(driver: WebDriver):
for link in links:
print(link)
driver.get(link)
time.sleep(1)
# 获取所有打开的窗口句柄
# all_windows = driver.window_handles
# for window in all_windows:
# if window != original_window:
# driver.switch_to.window(window)
try: try:
el = driver.find_element(By.XPATH, "//div[contains(@class, 'FileContent__Wrapper')]") el = driver.find_element(By.XPATH, "//div[contains(@class, 'FileContent__Wrapper')]")
if el is not None: if el is not None:
@ -65,11 +55,9 @@ def download_file_link(driver: WebDriver, links):
a = el.find_element(By.TAG_NAME, "a") a = el.find_element(By.TAG_NAME, "a")
if a is not None: if a is not None:
a.click() a.click()
time.sleep(1)
except Exception as e: except Exception as e:
print(e) pass
time.sleep(5)
# driver.back()
# driver.switch_to(original_window)
def download_image(url: str, download_dir: str, proxies): def download_image(url: str, download_dir: str, proxies):
@ -86,33 +74,19 @@ def download_image(url: str, download_dir: str, proxies):
print(f"下载结束 {url}, 保存路径:{save_path}") print(f"下载结束 {url}, 保存路径:{save_path}")
def download_images(driver: WebDriver, links, download_dir: str, proxies): def download_images(driver: WebDriver, link, download_dir: str, proxies):
print(f"最大并发下载数: {workers}") print(f"最大并发下载数: {workers}")
with ThreadPoolExecutor(max_workers=workers) as worker: with ThreadPoolExecutor(max_workers=workers) as worker:
img_list = []
for link in links:
part_img_list = [] part_img_list = []
print(link)
# https://monpetit17.fanbox.cc/posts/5183527
# https://api.fanbox.cc/post.info?postId=5183527
driver.get(link)
print("打开页面")
time.sleep(1)
try: try:
el = driver.find_element(By.XPATH, "//div[contains(@class, 'FileContent__Wrapper')]") el = driver.find_element(By.XPATH, "//div[contains(@class, 'FileContent__Wrapper')]")
if el is not None: if el is not None:
continue return
except Exception as e: except Exception as e:
pass pass
# html = driver.find_element(By.TAG_NAME, "html") for _ in range(0, 15):
# height = html.size['height']
# for _ in range(500, height, 500):
# driver.execute_script("window.scrollBy(0, 500)")
# time.sleep(1.5)
for _ in range(0, 10):
driver.execute_script("window.scrollBy(0, 500)") driver.execute_script("window.scrollBy(0, 500)")
time.sleep(0.2) time.sleep(0.2)
@ -127,14 +101,10 @@ def download_images(driver: WebDriver, links, download_dir: str, proxies):
href = element.get_attribute("href") href = element.get_attribute("href")
print(href) print(href)
part_img_list.append(href) part_img_list.append(href)
worker.submit(download_image, href, real_download_dir, proxies) worker.submit(download_image, href, real_download_dir, proxies)
print("获取 %d 个图片地址" % len(part_img_list)) print("获取 %d 个图片地址" % len(part_img_list))
print(part_img_list) print(part_img_list)
img_list += part_img_list
print("%d 个页面, 共计 %d 个图片" % (len(links), len(img_list)))
def main(driver: WebDriver, download_dir: str, proxies): def main(driver: WebDriver, download_dir: str, proxies):
@ -161,8 +131,11 @@ def main(driver: WebDriver, download_dir: str, proxies):
url = page_url % start_page url = page_url % start_page
links += find_link(driver, url) links += find_link(driver, url)
download_file_link(driver, links) for link in links:
download_images(driver, links, download_dir, proxies) driver.get(link)
time.sleep(1)
download_file_link(driver)
download_images(driver, link, download_dir, proxies)
links = [] links = []
@ -175,4 +148,7 @@ def main(driver: WebDriver, download_dir: str, proxies):
links.append(sub_page) links.append(sub_page)
print("获取子页面链接", links) print("获取子页面链接", links)
download_images(driver, links, download_dir, proxies) for link in links:
driver.get(link)
time.sleep(1)
download_images(driver, link, download_dir, proxies)