AutoExtractProvider now supports the new scrapy-poet cache interface #31
base: master
Commits: d82fa27, 332af78, 15a8237, acc66ca, 1e2be8a, def4914, 0f0dd55
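The cache interface referenced in the title boils down to three provider hooks that appear further down in this diff: `fingerprint()`, `serialize()` and `deserialize()`. As rough orientation before reading the diff, a cache-aware provider is sketched below; the method names and signatures are taken from this PR, while the exact contract expected by scrapy-poet's caching layer (and the top-level import) is an assumption here, not something this PR documents.

```python
from typing import Any, Callable, Sequence, Set

from scrapy import Request
from scrapy_poet import PageObjectInputProvider  # assumed import location


class CacheAwareProvider(PageObjectInputProvider):
    """Sketch (not the real implementation) of the hooks used by the cache."""

    def fingerprint(self, to_provide: Set[Callable], request: Request) -> str:
        # Stable key describing the work this provider would do for `request`;
        # equal fingerprints let a cached result be replayed instead of re-fetched.
        raise NotImplementedError

    def serialize(self, result: Sequence[Any]) -> Any:
        # Turn the provided instances into something the cache can store.
        raise NotImplementedError

    def deserialize(self, data: Any) -> Sequence[Any]:
        # Rebuild the provided instances from the stored representation.
        raise NotImplementedError
```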
Provider module diff:

@@ -1,11 +1,13 @@
 import inspect
+import json
 import logging
 import os
 from asyncio import CancelledError
-from typing import Callable, Set, ClassVar, List, Any, Hashable
+from typing import Callable, Set, ClassVar, List, Any, Hashable, Sequence, Type, TypeVar

 import aiohttp
-from scrapy import Request as ScrapyRequest, signals
+import attr
+from scrapy import Request as ScrapyRequest, signals, Request
 from scrapy.crawler import Crawler
 from scrapy.settings import Settings
 from autoextract.aio import request_raw, create_session

@@ -30,6 +32,9 @@
 _TASK_MANAGER = "_autoextract_task_manager"


+AEDataType = TypeVar('AEDataType', bound=AutoExtractData, covariant=True)
+
+
 def get_autoextract_task_manager(crawler: Crawler) -> TaskManager:
     """
     Return the configured :class:`scrapy_autoextract.TaskManager` that controls

@@ -66,11 +71,20 @@ def get_concurrent_requests_per_domain(settings: Settings):
     return concurrency


+@attr.define()
+class AERequestSpec:
+    query: List[AutoExtractRequest]
+    should_request_html: bool
+    is_extraction_required: bool
+    provided_cls: Callable
+
+
 class AutoExtractProvider(PageObjectInputProvider):
     """Provider for AutoExtract data"""
     # pageType requested when only html is required
     page_type_class_for_html: ClassVar[AutoExtractData] = AutoExtractProductData
     html_query_attribute: ClassVar[str] = "fullHtml"
     name = "autoextract"

     @classmethod
     def provided_classes(cls, type_: Callable) -> bool:

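The `@attr.define()` decorator (from the attrs package, newly imported above) generates `__init__`, `__repr__` and comparison methods for `AERequestSpec`, which is why `list_required_requests()` further below can build the specs positionally. A minimal, self-contained illustration of the same pattern with a stand-in class:

```python
import attr


@attr.define()
class RequestSpec:  # simplified stand-in for AERequestSpec
    query: list
    should_request_html: bool
    is_extraction_required: bool
    provided_cls: type


# attrs generates a positional __init__ in field-declaration order:
spec = RequestSpec(["<AutoExtract query>"], True, True, dict)
assert spec.should_request_html is True
print(spec)
# RequestSpec(query=['<AutoExtract query>'], should_request_html=True,
#             is_extraction_required=True, provided_cls=<class 'dict'>)
```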
@@ -172,7 +186,7 @@ def get_per_domain_concurrency_slot(self, request: ScrapyRequest) -> Hashable:

     def get_filled_request(self,
                            request: ScrapyRequest,
-                           provided_cls: AutoExtractData,
+                           provided_cls: Type[AEDataType],
                            should_request_html: bool) -> AutoExtractRequest:
         """Return a filled request for AutoExtract"""
         ae_request = AutoExtractRequest(

@@ -183,51 +197,68 @@ def get_filled_request(self,
             ae_request.extra = {self.html_query_attribute: True}
         return ae_request

-    async def __call__(self,
-                       to_provide: Set[Callable],
-                       request: ScrapyRequest
-                       ) -> List:
-        """Make an AutoExtract request and build a Page Input of provided class
-        based on API response data.
-        """
-        if not self.aiohttp_session:
-            self.aiohttp_session = await self.create_aiohttp_session()
+    def list_required_requests(self, to_provide: Set[Callable], request: ScrapyRequest):
         is_html_required = AutoExtractHtml in to_provide
         to_provide -= {AutoExtractHtml}
         is_extraction_required = bool(to_provide)
         if is_html_required and not is_extraction_required:
             # At least one request is required to get html
             to_provide = {self.page_type_class_for_html}

-        instances = []
+        if not to_provide:
+            return []
+        # Use the recommended type for html request, otherwise use the first one
+        class_for_html = (self.page_type_class_for_html
+                          if self.page_type_class_for_html in to_provide
+                          else next(iter(to_provide)))
+        specs = []
         for idx, provided_cls in enumerate(to_provide):
             if not issubclass(provided_cls, AutoExtractData):
                 raise RuntimeError(
                     f"Unexpected {provided_cls} requested. Probably a bug in the provider "
                     "or in scrapy-poet itself")

+            # html is requested only a single time to save resources
+            should_request_html = is_html_required and provided_cls is class_for_html
+            ae_request = self.get_filled_request(
+                request,
+                provided_cls,
+                should_request_html
+            )
+            specs.append(AERequestSpec(
+                [ae_request],
+                should_request_html,
+                is_extraction_required,
+                provided_cls
+            ))
+        return specs
+
+    async def __call__(self,
+                       to_provide: Set[Callable],
+                       request: ScrapyRequest
+                       ) -> List:
+        """Make an AutoExtract request and build a Page Input of provided class
+        based on API response data.
+        """
+        if not self.aiohttp_session:
+            self.aiohttp_session = await self.create_aiohttp_session()
+
+        instances = []
+        for spec in self.list_required_requests(to_provide, request):
             request_stats = AggStats()
-            is_first_request = idx == 0
-            page_type = provided_cls.page_type
+            page_type = spec.provided_cls.page_type

             def inc_stats(suffix, value=1, both=False):
                 self.stats.inc_value(f"autoextract/total{suffix}", value)
                 if both:
                     self.stats.inc_value(f"autoextract/{page_type}{suffix}", value)

             try:
-                # html is requested only a single time to save resources
-                should_request_html = is_html_required and is_first_request
                 slot = self.get_per_domain_concurrency_slot(request)
-                ae_request = self.get_filled_request(
-                    request,
-                    provided_cls,
-                    should_request_html
-                )
                 # When providing same-name arguments in both call and `__init__`
                 # this implementation will not cause any errors (while name=value implementation would),
                 # so predefined `__init__` arguments would override the same arguments in the call
                 awaitable = self.do_request_cached(**{
-                    'query': [ae_request],
+                    'query': spec.query,
                     'agg_stats': request_stats,
                     **self.common_request_kwargs,
                 })

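The key behavioural detail in `list_required_requests()` is that, even when several page types are requested together with `AutoExtractHtml`, only one of the resulting AutoExtract queries asks for `fullHtml`, preferring `page_type_class_for_html`. A self-contained rerun of just that selection logic, with plain classes standing in for the autoextract-poet page inputs:

```python
class ProductData:  # stand-in for AutoExtractProductData
    page_type = "product"


class ArticleData:  # stand-in for another AutoExtractData subclass
    page_type = "article"


page_type_class_for_html = ProductData  # the provider's default preference
to_provide = {ProductData, ArticleData}
is_html_required = True  # AutoExtractHtml was among the requested inputs

# Prefer the recommended type for the html request, otherwise pick any one.
class_for_html = (page_type_class_for_html
                  if page_type_class_for_html in to_provide
                  else next(iter(to_provide)))

for provided_cls in to_provide:
    should_request_html = is_html_required and provided_cls is class_for_html
    print(provided_cls.page_type, should_request_html)
# Prints (in any order): "product True" and "article False"
```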
@@ -251,15 +282,29 @@ def inc_stats(suffix, value=1, both=False):
                 inc_stats("/attempts/count", request_stats.n_attempts)
                 inc_stats("/attempts/billable", request_stats.n_billable_query_responses)

-            if should_request_html:
+            if spec.should_request_html:
                 instances.append(AutoExtractHtml(url=data[page_type]['url'],
                                                  html=data['html']))
                 inc_stats("/pages/html", both=True)

-            if is_extraction_required:
+            if spec.is_extraction_required:
                 without_html = {k: v for k, v in data.items() if k != "html"}
-                instances.append(provided_cls(data=without_html))
+                instances.append(spec.provided_cls(data=without_html))

             inc_stats("/pages/success", both=True)

         return instances

+    def fingerprint(self, to_provide: Set[Callable], request: Request) -> str:
+        queries = [spec.query[0].as_dict() for spec in self.list_required_requests(to_provide, request)]
+        # pageType is the key to sort by it
+        by_page_type = {query['pageType']: query for query in queries}
+        for query in by_page_type.values():
+            query.pop("pageType")
+        return json.dumps(by_page_type, sort_keys=True)
+
+    def serialize(self, result: Sequence[Any]) -> Any:
+        return result
+
+    def deserialize(self, data: Any) -> Sequence[Any]:
+        return data

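The fingerprint is a deterministic JSON encoding of the queries that `list_required_requests()` would issue, keyed by `pageType` (which is then dropped from each value, since it already serves as the key). A self-contained illustration with plain dicts standing in for `AutoExtractRequest.as_dict()` output; the field names here are examples, not the exact AutoExtract query schema:

```python
import json

# Plain dicts standing in for AutoExtractRequest.as_dict() results.
queries = [
    {"url": "https://example.com/item/1", "pageType": "product", "fullHtml": True},
    {"url": "https://example.com/item/1", "pageType": "article"},
]

by_page_type = {query["pageType"]: query for query in queries}
for query in by_page_type.values():
    query.pop("pageType")

print(json.dumps(by_page_type, sort_keys=True))
# {"article": {"url": "https://example.com/item/1"},
#  "product": {"fullHtml": true, "url": "https://example.com/item/1"}}
```

Since `serialize()` and `deserialize()` simply pass the page-input instances through, any on-disk encoding is left to scrapy-poet's cache backend.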
Install requirements diff:

@@ -29,7 +29,7 @@ def get_version():
     install_requires=[
         'autoextract-poet>=0.3.0',
         'zyte-autoextract>=0.7.0',
-        'scrapy-poet>=0.2.0',
+        'scrapy-poet @ git+https://[email protected]/scrapinghub/scrapy-poet@injector_record_replay_native#egg=scrapy-poet',
         'aiohttp',
         'tldextract',
         'sqlitedict>=1.7.0',
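The `scrapy-poet` requirement is switched from a released version to a PEP 508 direct URL pointing at the `injector_record_replay_native` branch, presumably until the new cache interface lands in a PyPI release. The equivalent manual install (assuming pip and git are available) would be:

```
pip install "scrapy-poet @ git+https://[email protected]/scrapinghub/scrapy-poet@injector_record_replay_native#egg=scrapy-poet"
```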