```python
async def main() -> None:
    async with Actor:
        Actor.log.info('Actor is being executed...')

        # Process Actor input
        actor_input = await Actor.get_input() or {}
        max_depth = actor_input.get('max_depth', 1)
        start_urls = [
            'https://gelsf.com/',
            'vowconstruction.com/',
            'https://prosperdevelopment.com/',
            'https://missionhomeremodeling.com/',
            'https://www.leefamilycorp.com/',
            'https://www.a2zremodelingcal.com/',
            'https://lemusco.com/',
            'https://www.agcsf.com/',
            'https://www.goldenheightsremodeling.com/',
        ]

        settings = _get_scrapy_settings(max_depth)

        domains = []

        def get_domain(url):
            try:
                if not urlparse(url).scheme:
                    url = 'http://' + url
                parsed_url = urlparse(url)
                domain = parsed_url.netloc
                if domain.startswith('www.'):
                    domain = domain[4:]
                return domain
            except ValueError:
                print(f'invalid url: {url}')
                return None

        for url in start_urls:
            domain = get_domain(url)
            if domain:
                domains.append(domain)

        process = CrawlerProcess(settings, install_root_handler=False)
        process.crawl(Spider, domain=domains, urls=start_urls)
        process.start()
        print('Finished scraping. Cleaning data...')
```
AttributeError: 'Testing' object has no attribute 'is_valid_url'
```python
def is_valid_url(self, url):
    try:
        parsed = urlparse(url)
        # urlparse rarely raises, so check that the parsed result actually
        # has a scheme and a netloc instead of returning True unconditionally
        return bool(parsed.scheme and parsed.netloc)
    except ValueError as e:
        print(f"Error validating URL: {e}")
        return False
```
```python
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
from urllib.parse import urlparse
from apify import Actor
from ..items import Items
import scrapy
import re
import json
```
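As for the AttributeError above: that message usually means is_valid_url is not actually defined on the Testing spider class, most often because the def ended up at module level (or indented under another function) rather than inside the class body. A minimal sketch of the placement that makes self.is_valid_url(...) resolve; the Testing name and the parse callback here are assumptions, not your actual spider:

```python
from urllib.parse import urlparse

import scrapy


class Testing(scrapy.Spider):
    name = 'testing'  # placeholder name

    def is_valid_url(self, url):
        # Indented inside the class body, so it is a method of Testing
        # and self.is_valid_url(url) works from any callback.
        try:
            parsed = urlparse(url)
            return bool(parsed.scheme and parsed.netloc)
        except ValueError as e:
            self.logger.warning(f'Error validating URL: {e}')
            return False

    def parse(self, response):
        for href in response.css('a::attr(href)').getall():
            url = response.urljoin(href)
            if self.is_valid_url(url):
                yield scrapy.Request(url, callback=self.parse)
```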
When I add your Item Pipeline to your Scrapy-Apify project (based on the Scrapy Actor template), it works, and the close_spider method is correctly called after the spider finishes its work. I even tried to use your DataCleaningPipeline, and it works; there is no bug in it. The problem is most likely in your main.py. I suggest you keep main.py as simple as possible, e.g. like this:

```python
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings

from apify import Actor

from .spiders.title import TitleSpider as Spider


def _get_scrapy_settings() -> Settings:
    settings = get_project_settings()
    settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000
    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000
    settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'
    return settings


async def main() -> None:
    async with Actor:
        Actor.log.info('Actor is being executed...')
        settings = _get_scrapy_settings()
        process = CrawlerProcess(settings, install_root_handler=False)
        process.crawl(Spider)
        process.start()
```
Move the start_urls, domain, and other related logic to the Spider (as class attributes), and then try to debug your Spider code:

```python
...

class TestSpider(Spider):
    name = 'test'

    second_pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-]+')
    email_pattern = re.compile(r'(?:mailto:)?[A-Za-z0-9._%+-]+@[A-Za-z.-]+\.[A-Za-z]{2,}')
    links_pattern = re.compile(r'(twitter\.com|facebook\.com|instagram\.com|linkedin\.com)/')
    phone_pattern = re.compile(r'tel:\+\d+')

    start_urls = [
        'https://gelsf.com/',
        'https://vowconstruction.com/',
        'https://prosperdevelopment.com/',
        'https://missionhomeremodeling.com/',
        'https://www.leefamilycorp.com/',
        'https://www.a2zremodelingcal.com/',
        'https://lemusco.com/',
        'https://www.agcsf.com/',
        'https://www.goldenheightsremodeling.com/',
    ]

    allowed_domains = [
        'gelsf.com',
        'vowconstruction.com',
        'prosperdevelopment.com',
        'missionhomeremodeling.com',
        'www.leefamilycorp.com',
        'www.a2zremodelingcal.com',
        'lemusco.com',
        'www.agcsf.com',
        'www.goldenheightsremodeling.com',
    ]

    headers = {
        ...
    }

    ...
```
I removed the apify.scrapy.pipelines.ActorDatasetPushPipeline because it was pushing the non-cleaned items.
```python
class CleaningPipeline:
    def process_item(self, item: BookItem, spider: Spider) -> BookItem:
        number_map = {
            'one': 1,
            'two': 2,
            'three': 3,
            'four': 4,
            'five': 5,
        }
        return BookItem(
            title=item['title'],
            price=float(item['price'].replace('£', '')),
            rating=number_map[item['rating'].split(' ')[1].lower()],
            in_stock=bool(item['in_stock'].lower() == 'in stock'),
        )
```
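An alternative to removing apify.scrapy.pipelines.ActorDatasetPushPipeline is to keep it and register the cleaning pipeline with a lower priority number: Scrapy runs the entries in ITEM_PIPELINES in ascending order of their value, so the dataset push then receives the already-cleaned item returned by process_item. A minimal sketch, where 'src.pipelines.CleaningPipeline' is a placeholder module path to adjust to the project:

```python
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings


def _get_scrapy_settings() -> Settings:
    settings = get_project_settings()

    # Lower number = runs earlier: clean first, then push to the dataset.
    # 'src.pipelines.CleaningPipeline' is a placeholder path; point it at the
    # module that actually defines CleaningPipeline.
    settings['ITEM_PIPELINES']['src.pipelines.CleaningPipeline'] = 500
    settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000

    return settings
```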
I don't know what the allowed domains will be, nor the start URLs.
No, the allowed domains come from user input, so I don't know what the user input will be.
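For what it's worth, the start URLs and domains don't have to be class attributes to follow the advice above: process.crawl() forwards keyword arguments to the spider's constructor, so values read from Actor.get_input() can be passed through at runtime. A rough sketch under that assumption; the 'start_urls' input field name and the Spider class are placeholders, not your actual schema:

```python
from urllib.parse import urlparse

import scrapy
from apify import Actor
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


class Spider(scrapy.Spider):
    name = 'user_input_spider'  # placeholder name

    def __init__(self, *args, start_urls=None, allowed_domains=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Both lists arrive from the Actor input instead of class attributes.
        self.start_urls = start_urls or []
        self.allowed_domains = allowed_domains or []


async def main() -> None:
    async with Actor:
        actor_input = await Actor.get_input() or {}
        # 'start_urls' is an assumed input field holding plain URL strings
        # (with schemes); adjust it to the real input schema.
        start_urls = actor_input.get('start_urls', [])
        allowed_domains = [urlparse(url).netloc.removeprefix('www.') for url in start_urls]

        settings = get_project_settings()  # or _get_scrapy_settings() as above
        process = CrawlerProcess(settings, install_root_handler=False)
        # Keyword arguments here are forwarded to Spider.__init__.
        process.crawl(Spider, start_urls=start_urls, allowed_domains=allowed_domains)
        process.start()
```

Scrapy's off-site filtering reads spider.allowed_domains when the spider opens, so setting it in __init__ should be early enough for the filter to apply.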