import hashlib
import io
import os
import os.path
import time
from urllib.parse import quote_plus

import chromedriver_binary
import requests
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
# Text file that lists the search keywords, one entry per line.
filename = "search_list.txt"
def load_doc(filename):
    """Return the full contents of the text file at *filename*.

    Args:
        filename: Path of the file to read (opened in text mode).

    Returns:
        The file's contents as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()
def load_set(filename):
    """Load the unique identifiers listed in the file at *filename*.

    Each non-blank line contributes the text that precedes its first '.'
    (so ``cat.jpg`` yields ``cat``), with surrounding whitespace removed.

    Args:
        filename: Path of the list file, one entry per line.

    Returns:
        A set of the non-empty identifiers found in the file.
    """
    identifiers = set()
    for line in load_doc(filename).splitlines():
        identifier = line.split('.')[0].strip()
        # Skip blank lines and entries such as ".hidden" that would yield an
        # empty identifier (and later an empty search / folder name).
        if identifier:
            identifiers.add(identifier)
    return identifiers
def _collect_image_urls(query, limit, pause):
    """Return the set of image URLs Google Images shows for *query*."""
    search_url = (
        "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp"
        "&q={q}&oq={q}&gs_l=img"
    )
    image_urls = set()
    options = webdriver.ChromeOptions()
    wd = webdriver.Chrome(options=options)
    try:
        # Percent-encode the keyword so spaces, '&', '#', etc. cannot corrupt
        # the query string.
        wd.get(search_url.format(q=quote_plus(query)))
        thumbnail_results = wd.find_elements(By.CSS_SELECTOR, "img.rg_i")
        for img in thumbnail_results[:limit]:
            try:
                img.click()
                time.sleep(pause)
            except Exception:
                # Best effort: a thumbnail that cannot be clicked is skipped.
                continue
            for candidate in wd.find_elements(By.CLASS_NAME, 'n3VNCb'):
                url = candidate.get_attribute('src')
                # Only keep sources that contain an https URL.
                if url and 'https' in url:
                    image_urls.add(url)
        # Extra pause before shutting the browser down.
        # NOTE(review): presumably lets the page settle before quit - confirm.
        time.sleep(pause + 3)
    finally:
        # Always release the browser, even if scraping failed part-way.
        wd.quit()
    return image_urls


def _save_image(url, folder):
    """Download *url* and store it in *folder* as an RGB JPEG; log the result."""
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Nothing was downloaded, so there is nothing to save for this URL.
        return
    try:
        image = Image.open(io.BytesIO(image_content)).convert('RGB')
        # The file name is derived from the content hash, so identical images
        # map to the same file.
        file_name = hashlib.sha1(image_content).hexdigest()[:10] + '.jpg'
        file_path = os.path.join(folder, file_name)
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=90)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")


def run(search_keyword, download_num=3, sleep_between_interactions=2):
    """Download the first Google Images results for *search_keyword*.

    Opens a Chrome session on the Google image search page, clicks up to
    *download_num* thumbnails, collects the https image URLs exposed after
    each click, and saves every downloaded image as a JPEG in a folder
    named after the keyword (created if missing). Per-image download and
    save failures are printed and skipped.

    Args:
        search_keyword: Text to search for; also used as the output folder.
        download_num: Maximum number of thumbnails to click.
        sleep_between_interactions: Seconds to wait after each click.
    """
    image_urls = _collect_image_urls(
        search_keyword, download_num, sleep_between_interactions
    )
    image_save_folder_path = search_keyword
    os.makedirs(image_save_folder_path, exist_ok=True)
    for url in image_urls:
        _save_image(url, image_save_folder_path)
# Script entry point: only scrape when executed directly, not when imported.
if __name__ == "__main__":
    search_keywords = load_set(filename)
    # Sets iterate in arbitrary order; sort so runs are reproducible.
    for search_keyword in sorted(search_keywords):
        run(search_keyword)