import threading
import requests
from upstash_redis import Redis
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# Initialize Redis client
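# Redis.from_env() expects UPSTASH_REDIS_REST_URL and UPSTASH_REDIS_REST_TOKEN
# to be set (here, loaded from the .env file above)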
redis = Redis.from_env()
# Group URLs by thread; a couple of URLs are deliberately repeated across groups
urls_to_scrape_groups = [
    [
        'https://httpbin.org/delay/1',
        'https://httpbin.org/delay/4',
        'https://httpbin.org/delay/2',
        'https://httpbin.org/delay/5',
        'https://httpbin.org/delay/3',
    ],
    [
        'https://httpbin.org/delay/5',  # Overlapping URL (also in group 1)
        'https://httpbin.org/delay/6',
        'https://httpbin.org/delay/7',
        'https://httpbin.org/delay/2',  # Overlapping URL (also in group 1)
        'https://httpbin.org/delay/8',
    ],
    [
        'https://httpbin.org/delay/3',  # Overlapping URL (also in group 1)
        'https://httpbin.org/delay/9',
        'https://httpbin.org/delay/10',
        'https://httpbin.org/delay/4',  # Overlapping URL (also in group 1)
        'https://httpbin.org/delay/11',
    ],
]
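
# The repeated URLs above are the point of the demo: whichever thread fetches
# a URL first populates Redis, and later lookups for that URL (from any
# thread) are served from the cache instead of re-fetching.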
class Scraper(threading.Thread):
    def __init__(self, urls):
        super().__init__()
        self.urls = urls
        self.results = {}
    def run(self):
        for url in self.urls:
            cache_key = f"url:{url}"
            # Attempt to retrieve a cached response
            cached_response = redis.get(cache_key)
            if cached_response:
                print(f"[CACHE HIT] {self.name} - URL: {url}")
                self.results[url] = cached_response
                continue  # Skip to the next URL when a cached copy exists
            # Cache miss: perform the HTTP request, with a timeout so a hung
            # connection cannot stall the thread indefinitely
            print(f"[FETCHING] {self.name} - URL: {url}")
            try:
                response = requests.get(url, timeout=30)
            except requests.RequestException as exc:
                print(f"[ERROR] {self.name} - Request for {url} failed: {exc}")
                self.results[url] = None
                continue
            if response.status_code == 200:
                self.results[url] = response.text
                # Store the response body in the Redis cache; pass ex=<seconds>
                # to redis.set to give entries a TTL instead of caching forever
                redis.set(cache_key, response.text)
            else:
                print(f"[ERROR] {self.name} - Failed to retrieve {url}")
                self.results[url] = None
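
# Because the threads run concurrently, two threads can both miss the cache
# for a shared URL and fetch it twice; the cache still converges, since the
# second writer simply overwrites the key with an equivalent response body.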
def main():
    threads = []
    for urls in urls_to_scrape_groups:
        scraper = Scraper(urls)
        threads.append(scraper)
        scraper.start()
    # Wait for all threads to complete
    for scraper in threads:
        scraper.join()
    print("\nScraping results:")
    for scraper in threads:
        for url, result in scraper.results.items():
            print(f"Thread {scraper.name} - URL: {url} - Response Length: {len(result) if result else 'Failed'}")

if __name__ == "__main__":
    main()