9 changes: 8 additions & 1 deletion README.rst
@@ -118,7 +118,7 @@ For example::
The examples above extract an article from the page, but you may want to
extract a different type of item, like a product or a job posting. It is
as easy as using the correct type annotation in the callback. This
-is how the callback looks like if we need to extract a real state
+is how the callback looks if we need to extract real estate data
from the page::

def parse(self,
@@ -245,6 +245,13 @@ Provider settings
- ``AUTOEXTRACT_CACHE_GZIP`` [optional] when True (default), cached AutoExtract
responses are compressed using gzip. Set this option to False to turn
compression off.
- ``AUTOEXTRACT_CACHE_COLLECTION`` [optional] when True, AutoExtract responses
  are stored in a Scrapy Cloud collection named after the job id,
  e.g. ``111_222_333_cache`` for job ``111/222/333``.
  Using collections is mutually exclusive with the ``AUTOEXTRACT_CACHE_FILENAME`` setting.
  If the spider is run locally, the project number should be set in the ``DEV_PROJECT``
  setting, and the collection name defaults to ``dev_cache``.
  The collection name can be customised with the ``AUTOEXTRACT_CACHE_COLLECTION_NAME``
  setting, as in the sketch below.
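  A minimal ``settings.py`` sketch (the project id and collection name below are
  illustrative values, not defaults)::

      # Cache AutoExtract responses in a Scrapy Cloud collection.
      AUTOEXTRACT_CACHE_COLLECTION = True
      # Only needed for local runs outside Scrapy Cloud.
      DEV_PROJECT = 123456
      # Optional override of the collection name derived from the job id.
      AUTOEXTRACT_CACHE_COLLECTION_NAME = 'my_autoextract_cache'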

Limitations
===========
32 changes: 31 additions & 1 deletion scrapy_autoextract/cache.py
@@ -1,11 +1,12 @@
import abc
-import json
import gzip
+import json
import pickle
import sqlite3

import sqlitedict
from autoextract.request import Request
+from scrapinghub import NotFound, ScrapinghubClient


class _Cache(abc.ABC):
@@ -88,3 +89,32 @@ def __setitem__(self, fingerprint: str, value) -> None:

def close(self):
self.db.close()


class ScrapyCloudCollectionCache(_Cache):
def __init__(self, project, collection):
self.sc = ScrapinghubClient()
self.collection = self.sc.get_project(project).collections.get_store(collection)

@classmethod
def fingerprint(cls, request: Request) -> str:
return json.dumps(
request.as_dict(),
ensure_ascii=False,
sort_keys=True
)

def __getitem__(self, fingerprint: str):
try:
return self.collection.get(fingerprint)
except NotFound:
raise KeyError

def __setitem__(self, fingerprint: str, value) -> None:
self.collection.set(
{'_key': fingerprint,
'value': value}
)

def close(self):
self.sc.close()
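For illustration, a minimal usage sketch of the new cache, assuming a Scrapy Cloud API key is available to ``ScrapinghubClient``; the project id, URL and payload below are placeholder values:

    from autoextract.request import Request
    from scrapy_autoextract.cache import ScrapyCloudCollectionCache

    cache = ScrapyCloudCollectionCache(123456, 'dev_cache')
    key = ScrapyCloudCollectionCache.fingerprint(
        Request(url='http://example.com', pageType='article'))
    try:
        cached = cache[key]            # raises KeyError on a cache miss
    except KeyError:
        cache[key] = {'article': {}}   # placeholder payload for the sketch
    cache.close()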
41 changes: 29 additions & 12 deletions scrapy_autoextract/providers.py
@@ -2,27 +2,28 @@
import logging
import os
from asyncio import CancelledError
-from typing import Callable, Set, ClassVar, List, Any, Hashable
+from typing import Any, Callable, ClassVar, Hashable, List, Set

import aiohttp
-from scrapy import Request as ScrapyRequest, signals
-from scrapy.crawler import Crawler
-from scrapy.settings import Settings
-from autoextract.aio import request_raw, create_session
-from autoextract.aio.errors import RequestError, \
-    ACCOUNT_DISABLED_ERROR_TYPE
+from autoextract.aio import create_session, request_raw
+from autoextract.aio.errors import ACCOUNT_DISABLED_ERROR_TYPE, RequestError
from autoextract.aio.retry import RetryFactory
from autoextract.request import Request as AutoExtractRequest
from autoextract.stats import AggStats
-from autoextract_poet.page_inputs import (
-    AutoExtractProductData, AutoExtractData, AutoExtractHtml,
-)
+from autoextract_poet.page_inputs import (AutoExtractData, AutoExtractHtml,
+                                          AutoExtractProductData)
+from scrapy import Request as ScrapyRequest
+from scrapy import signals
+from scrapy.crawler import Crawler
+from scrapy.settings import Settings
from scrapy_poet.page_input_providers import PageObjectInputProvider

+from .cache import AutoExtractCache, DummyCache, ScrapyCloudCollectionCache
from .errors import QueryError, summarize_exception
from .slot_semaphore import SlotsSemaphore
from .task_manager import TaskManager
-from .utils import get_domain, get_scrapy_data_path
-from .cache import AutoExtractCache, DummyCache
+from .utils import (get_collection_name, get_domain, get_project_from_job,
+                    get_scrapy_data_path)

logger = logging.getLogger(__name__)

@@ -93,11 +94,19 @@ def __init__(self, crawler: Crawler):
self.per_domain_semaphore = SlotsSemaphore(per_domain_concurrency)

cache_filename = self.settings.get('AUTOEXTRACT_CACHE_FILENAME')
cache_collection = self.settings.get('AUTOEXTRACT_CACHE_COLLECTION')
check_configuration(cache_filename, cache_collection)
if cache_filename:
cache_filename = os.path.join(get_scrapy_data_path(createdir=True),
cache_filename)
compressed = self.settings.getbool('AUTOEXTRACT_CACHE_GZIP', True)
self.cache = AutoExtractCache(cache_filename, compressed=compressed)
elif cache_collection:
project = get_project_from_job() or self.settings.get('DEV_PROJECT')
self.cache = ScrapyCloudCollectionCache(
project,
get_collection_name(self)
)
else:
self.cache = DummyCache()

@@ -263,3 +272,11 @@ def inc_stats(suffix, value=1, both=False):
inc_stats("/pages/success", both=True)

return instances


def check_configuration(cache_filename, cache_collection):
if all([cache_filename, cache_collection]):
raise ValueError(
"Configuration error. "
"Both AUTOEXTRACT_CACHE_FILENAME and AUTOEXTRACT_CACHE_COLLECTION defined in settings."
)
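A quick sketch of how the guard behaves (the cache filename is illustrative):

    check_configuration('autoextract_cache.sqlite3', True)   # raises ValueError
    check_configuration('autoextract_cache.sqlite3', None)   # fine: file cache only
    check_configuration(None, True)                          # fine: collection cache only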
16 changes: 16 additions & 0 deletions scrapy_autoextract/utils.py
@@ -28,3 +28,19 @@ def get_scrapy_data_path(createdir=True):
if createdir:
os.makedirs(path, exist_ok=True)
return path


def get_collection_name(provider):
from_settings = provider.settings.get('AUTOEXTRACT_CACHE_COLLECTION_NAME')
scrapy_job = os.environ.get('SCRAPY_JOB')
if from_settings:
return from_settings
elif scrapy_job:
return f"{scrapy_job.replace('/', '_')}_cache"
return 'dev_cache'


def get_project_from_job():
scrapy_job = os.environ.get('SCRAPY_JOB')
if scrapy_job:
return scrapy_job.split('/')[0]
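A small sketch of how these helpers resolve the project and collection name from the ``SCRAPY_JOB`` environment variable set by Scrapy Cloud; the job id and the stand-in provider are illustrative:

    import os
    from scrapy_autoextract.utils import get_collection_name, get_project_from_job

    class FakeProvider:
        # Stand-in for the provider: only .settings.get() is used here.
        settings = {}

    os.environ['SCRAPY_JOB'] = '111/222/333'   # set by Scrapy Cloud on real jobs
    get_project_from_job()                     # -> '111'
    get_collection_name(FakeProvider())        # -> '111_222_333_cache'

    del os.environ['SCRAPY_JOB']               # local run, no job id
    get_project_from_job()                     # -> None
    get_collection_name(FakeProvider())        # -> 'dev_cache'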
1 change: 1 addition & 0 deletions setup.py
@@ -33,6 +33,7 @@ def get_version():
'aiohttp',
'tldextract',
'sqlitedict>=1.7.0',
'scrapinghub',
],
keywords='scrapy autoextract middleware',
classifiers=[