Batch-crawl fanbox
This commit is contained in:
parent 4accaec286 · commit e385e9845d

run/__init__.py
```diff
@@ -47,29 +47,17 @@ def has_next(driver: WebDriver):
     return has_next
 
 
-def download_file_link(driver: WebDriver, links):
-    for link in links:
-        print(link)
-        driver.get(link)
-
-        time.sleep(1)
-        # get handles of all open windows
-        # all_windows = driver.window_handles
-        # for window in all_windows:
-        #     if window != original_window:
-        #         driver.switch_to.window(window)
-        try:
-            el = driver.find_element(By.XPATH, "//div[contains(@class, 'FileContent__Wrapper')]")
-            if el is not None:
-                print(el)
-                a = el.find_element(By.TAG_NAME, "a")
-                if a is not None:
-                    a.click()
-        except Exception as e:
-            print(e)
-        time.sleep(5)
-        # driver.back()
-        # driver.switch_to(original_window)
+def download_file_link(driver: WebDriver):
+    try:
+        el = driver.find_element(By.XPATH, "//div[contains(@class, 'FileContent__Wrapper')]")
+        if el is not None:
+            print(el)
+            a = el.find_element(By.TAG_NAME, "a")
+            if a is not None:
+                a.click()
+                time.sleep(1)
+    except Exception as e:
+        pass
 
 
 def download_image(url: str, download_dir: str, proxies):
```
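The new `download_file_link` assumes the driver is already on the post page, swallows every exception, and pauses a fixed second after the click. A minimal sketch of a more explicit variant using `WebDriverWait` instead of a bare `find_element` plus sleeps (the 5-second timeout and the log message are assumptions, not part of this commit):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def download_file_link(driver: WebDriver):
    """Click the file-attachment link on the current post page, if one exists."""
    try:
        # Wait up to 5s for the attachment wrapper instead of failing instantly.
        el = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[contains(@class, 'FileContent__Wrapper')]")
            )
        )
        el.find_element(By.TAG_NAME, "a").click()
    except Exception as e:
        # Post has no attachment, or the class name changed: skip it.
        print(f"no file link: {e}")
```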
```diff
@@ -86,55 +74,37 @@ def download_image(url: str, download_dir: str, proxies):
     print(f"Download finished {url}, save path: {save_path}")
 
 
-def download_images(driver: WebDriver, links, download_dir: str, proxies):
+def download_images(driver: WebDriver, link, download_dir: str, proxies):
     print(f"Max concurrent downloads: {workers}")
     with ThreadPoolExecutor(max_workers=workers) as worker:
-        img_list = []
-        for link in links:
-            part_img_list = []
-
-            print(link)
-            # https://monpetit17.fanbox.cc/posts/5183527
-            # https://api.fanbox.cc/post.info?postId=5183527
-            driver.get(link)
-            print("Page opened")
-
-            time.sleep(1)
-            try:
-                el = driver.find_element(By.XPATH, "//div[contains(@class, 'FileContent__Wrapper')]")
-                if el is not None:
-                    continue
-            except Exception as e:
-                pass
-
-            # html = driver.find_element(By.TAG_NAME, "html")
-            # height = html.size['height']
-            # for _ in range(500, height, 500):
-            #     driver.execute_script("window.scrollBy(0, 500)")
-            #     time.sleep(1.5)
-            for _ in range(0, 10):
-                driver.execute_script("window.scrollBy(0, 500)")
-                time.sleep(0.2)
-
-            img_link_elements = driver.find_elements(By.XPATH, "//a[contains(@class, 'PostImage__Anchor')]")
-
-            sub_dir = link.split("/")[-1]
-            real_download_dir = os.path.join(download_dir, sub_dir)
-            if not os.path.exists(real_download_dir):
-                os.makedirs(real_download_dir)
-
-            for element in img_link_elements:
-                href = element.get_attribute("href")
-                print(href)
-                part_img_list.append(href)
-
-                worker.submit(download_image, href, real_download_dir, proxies)
-
-            print("Got %d image URLs" % len(part_img_list))
-            print(part_img_list)
-            img_list += part_img_list
-
-        print("%d pages in total, %d images in total" % (len(links), len(img_list)))
+        part_img_list = []
+
+        try:
+            el = driver.find_element(By.XPATH, "//div[contains(@class, 'FileContent__Wrapper')]")
+            if el is not None:
+                return
+        except Exception as e:
+            pass
+
+        for _ in range(0, 15):
+            driver.execute_script("window.scrollBy(0, 500)")
+            time.sleep(0.2)
+
+        img_link_elements = driver.find_elements(By.XPATH, "//a[contains(@class, 'PostImage__Anchor')]")
+
+        sub_dir = link.split("/")[-1]
+        real_download_dir = os.path.join(download_dir, sub_dir)
+        if not os.path.exists(real_download_dir):
+            os.makedirs(real_download_dir)
+
+        for element in img_link_elements:
+            href = element.get_attribute("href")
+            print(href)
+            part_img_list.append(href)
+            worker.submit(download_image, href, real_download_dir, proxies)
+
+        print("Got %d image URLs" % len(part_img_list))
+        print(part_img_list)
 
 
 def main(driver: WebDriver, download_dir: str, proxies):
```
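`download_images` now handles a single post: the fixed `range(0, 15)` scroll loop is a heuristic to make fanbox's lazy-loaded images render before the `PostImage__Anchor` links are collected, so very long posts may still be cut off. A sketch of a height-based alternative that scrolls until the document stops growing (the `max_rounds` cap is an assumption to guard against endless feeds):

```python
import time

from selenium.webdriver.remote.webdriver import WebDriver


def scroll_until_stable(driver: WebDriver, max_rounds: int = 60, pause: float = 0.2):
    """Scroll down step by step until the page height stops increasing."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollBy(0, 500)")
        time.sleep(pause)  # give lazy-loaded images time to mount
        height = driver.execute_script("return document.body.scrollHeight")
        at_bottom = driver.execute_script(
            "return window.pageYOffset + window.innerHeight "
            ">= document.body.scrollHeight"
        )
        if at_bottom and height == last_height:
            break  # bottom reached and nothing new was loaded
        last_height = height
```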
```diff
@@ -161,8 +131,11 @@ def main(driver: WebDriver, download_dir: str, proxies):
         url = page_url % start_page
         links += find_link(driver, url)
 
-    download_file_link(driver, links)
-    download_images(driver, links, download_dir, proxies)
+    for link in links:
+        driver.get(link)
+        time.sleep(1)
+        download_file_link(driver)
+        download_images(driver, link, download_dir, proxies)
 
     links = []
```
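One consequence of the per-post split: each `download_images` call opens its own `ThreadPoolExecutor`, and leaving the `with` block joins all submitted downloads before `main` navigates to the next link, so images from different posts never download in parallel. A hypothetical refactor (not in this commit) would hoist the pool into the crawl loop; the extra `worker` parameter on `download_images` is assumed here:

```python
import time
from concurrent.futures import ThreadPoolExecutor

from selenium.webdriver.remote.webdriver import WebDriver


def crawl_posts(driver: WebDriver, links, download_dir: str, proxies, workers: int = 8):
    """Visit each post once while one shared pool drains downloads across posts."""
    with ThreadPoolExecutor(max_workers=workers) as worker:
        for link in links:
            driver.get(link)
            time.sleep(1)
            download_file_link(driver)
            # hypothetical signature: download_images submits to the shared pool
            download_images(driver, link, download_dir, proxies, worker)
```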
```diff
@@ -175,4 +148,7 @@ def main(driver: WebDriver, download_dir: str, proxies):
         links.append(sub_page)
 
     print("Got sub-page links", links)
-    download_images(driver, links, download_dir, proxies)
+    for link in links:
+        driver.get(link)
+        time.sleep(1)
+        download_images(driver, link, download_dir, proxies)
```
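`download_image` itself is untouched and its body is not shown in this diff; judging from its signature and the `Download finished {url}, save path: {save_path}` log line, it is presumably a streamed HTTP download. A minimal sketch consistent with that (the `requests` usage, filename scheme, chunk size, and timeout are all assumptions):

```python
import os

import requests


def download_image(url: str, download_dir: str, proxies):
    """Stream one image to download_dir, naming it after the last URL segment."""
    save_path = os.path.join(download_dir, url.split("/")[-1])
    resp = requests.get(url, proxies=proxies, stream=True, timeout=30)
    resp.raise_for_status()
    with open(save_path, "wb") as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"Download finished {url}, save path: {save_path}")
```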