4 changes: 2 additions & 2 deletions docs/guides/code_examples/http_crawlers/selectolax_parser.py
@@ -45,7 +45,7 @@ def is_matching_selector(

@override
def find_links(
self, parsed_content: LexborHTMLParser, selector: str
self, parsed_content: LexborHTMLParser, selector: str, attribute: str
) -> Iterable[str]:
"""Extract href attributes from elements matching the selector.

@@ -54,7 +54,7 @@ def find_links(
link: LexborNode
urls: list[str] = []
for link in parsed_content.css(selector):
url = link.attributes.get('href')
url = link.attributes.get(attribute)
if url:
urls.append(url.strip())
return urls
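
For illustration, here is a minimal standalone sketch of the updated guide code, run against selectolax directly so the effect of the new `attribute` parameter is visible outside crawlee. The HTML string is illustrative:

```python
# Sketch of the updated find_links logic. Assumes selectolax is installed.
from selectolax.lexbor import LexborHTMLParser


def find_links(parsed_content: LexborHTMLParser, selector: str, attribute: str) -> list[str]:
    urls: list[str] = []
    for link in parsed_content.css(selector):
        url = link.attributes.get(attribute)  # Attribute name is no longer hard-coded to 'href'.
        if url:
            urls.append(url.strip())
    return urls


tree = LexborHTMLParser('<img src="/image_1"><img src="/image_2"><a href="/page_1">x</a>')
print(find_links(tree, 'img', 'src'))  # ['/image_1', '/image_2']
print(find_links(tree, 'a', 'href'))   # ['/page_1']
```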
5 changes: 5 additions & 0 deletions src/crawlee/_types.py
@@ -387,6 +387,7 @@ def __call__(
self,
*,
selector: str | None = None,
attribute: str | None = None,
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
@@ -411,6 +412,7 @@ def __call__(
self,
*,
selector: str | None = None,
attribute: str | None = None,
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
@@ -428,6 +430,7 @@ def __call__(
- `PlaywrightCrawler` supports CSS and XPath selectors.
- `ParselCrawler` supports CSS selectors.
- `BeautifulSoupCrawler` supports CSS selectors.
attribute: Which node attribute to extract the links from.
label: Label for the newly created `Request` objects, used for request routing.
user_data: User data to be provided to the newly created `Request` objects.
transform_request_function: A function that takes `RequestOptions` and returns either:
@@ -457,6 +460,7 @@ def __call__(
self,
*,
selector: str = 'a',
attribute: str = 'href',
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
@@ -470,6 +474,7 @@ def __call__(
- `PlaywrightCrawler` supports CSS and XPath selectors.
- `ParselCrawler` supports CSS selectors.
- `BeautifulSoupCrawler` supports CSS selectors.
attribute: Which node attribute to extract the links from.
label: Label for the newly created `Request` objects, used for request routing.
user_data: User data to be provided to the newly created `Request` objects.
transform_request_function: A function that takes `RequestOptions` and returns either:
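
These overloads define the user-facing API. A hedged usage sketch, assuming a standard crawlee project (the `ParselCrawler` import path follows crawlee's published docs; the start URL is a placeholder):

```python
# Usage sketch: enqueue image URLs instead of anchor hrefs.
import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

crawler = ParselCrawler()


@crawler.router.default_handler
async def handler(context: ParselCrawlingContext) -> None:
    # Defaults remain selector='a', attribute='href'; both can now be overridden.
    await context.enqueue_links(selector='img', attribute='src', label='IMAGE')


asyncio.run(crawler.run(['https://example.com']))
```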
7 changes: 5 additions & 2 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -176,6 +176,7 @@ def _create_extract_links_function(
async def extract_links(
*,
selector: str = 'a',
attribute: str = 'href',
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
@@ -191,10 +192,12 @@ async def extract_links(
kwargs.setdefault('strategy', 'same-hostname')
strategy = kwargs.get('strategy', 'same-hostname')

links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
links_iterator: Iterator[str] = iter(
self._parser.find_links(parsed_content, selector=selector, attribute=attribute)
)

# Get base URL from <base> tag if present
extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]', 'href'))
base_url: str = (
str(extracted_base_urls[0])
if extracted_base_urls
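
The second change here feeds the base-URL logic: the `<base href>` lookup now passes the attribute explicitly. A standard-library sketch of the resolution this enables, with illustrative values:

```python
# If the page has a <base href>, relative links are resolved against it
# rather than against the request URL.
from urllib.parse import urljoin

page_url = 'https://example.com/base_page'
extracted_base_urls = ['https://example.com/base_subpath/']  # Result of find_links(..., 'base[href]', 'href').
links = ['page_5', '/page_4']

base_url = extracted_base_urls[0] if extracted_base_urls else page_url
print([urljoin(base_url, link) for link in links])
# ['https://example.com/base_subpath/page_5', 'https://example.com/page_4']
```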
3 changes: 2 additions & 1 deletion src/crawlee/crawlers/_abstract_http/_abstract_http_parser.py
@@ -93,12 +93,13 @@ def is_matching_selector(self, parsed_content: TParseResult, selector: str) -> bool:
"""

@abstractmethod
def find_links(self, parsed_content: TParseResult, selector: str) -> Iterable[str]:
def find_links(self, parsed_content: TParseResult, selector: str, attribute: str) -> Iterable[str]:
"""Find all links in result using selector.

Args:
parsed_content: Parsed HTTP response. Result of `parse` method.
selector: String used to define matching pattern for finding links.
attribute: Which node attribute to extract the links from.

Returns:
Iterable of strings that contain found links.
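
Because the abstract signature changed, third-party parsers built on `AbstractHttpParser` must update their overrides, as the selectolax guide change above does. A minimal hypothetical stub (`MyParseResult` is an invented parse-result type):

```python
# Hypothetical subclass stub: the override must now accept `attribute`,
# or it no longer matches the abstract method's signature.
from collections.abc import Iterable


class MyParseResult: ...


class MyParser:  # Stands in for an AbstractHttpParser[MyParseResult] subclass.
    def find_links(self, parsed_content: MyParseResult, selector: str, attribute: str) -> Iterable[str]:
        return []
```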
6 changes: 4 additions & 2 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -997,6 +997,7 @@ def _create_enqueue_links_function(
async def enqueue_links(
*,
selector: str | None = None,
attribute: str | None = None,
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
@@ -1010,9 +1011,9 @@ async def enqueue_links(
kwargs.setdefault('strategy', 'same-hostname')

if requests:
if any((selector, label, user_data, transform_request_function)):
if any((selector, attribute, label, user_data, transform_request_function)):
raise ValueError(
'You cannot provide `selector`, `label`, `user_data` or '
'You cannot provide `selector`, `attribute`, `label`, `user_data` or '
'`transform_request_function` arguments when `requests` is provided.'
)
# Add directly passed requests.
@@ -1024,6 +1025,7 @@
await context.add_requests(
await extract_links(
selector=selector or 'a',
attribute=attribute or 'href',
label=label,
user_data=user_data,
transform_request_function=transform_request_function,
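
The guard now covers the new argument as well. A standalone sketch of the exclusivity check (the function name is hypothetical; the message mirrors the hunk):

```python
# Explicit `requests` cannot be combined with any link-extraction option,
# including the new `attribute`.
def check_exclusive(requests=None, selector=None, attribute=None, label=None,
                    user_data=None, transform_request_function=None) -> None:
    if requests and any((selector, attribute, label, user_data, transform_request_function)):
        raise ValueError(
            'You cannot provide `selector`, `attribute`, `label`, `user_data` or '
            '`transform_request_function` arguments when `requests` is provided.'
        )


check_exclusive(requests=['https://example.com/next'], attribute='src')  # Raises ValueError.
```

Note that the `attribute or 'href'` fallback in the call below it preserves the old behavior whenever the caller omits the argument.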
4 changes: 2 additions & 2 deletions src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py
@@ -38,11 +38,11 @@ async def select(self, parsed_content: Tag, selector: str) -> Sequence[Tag]:
return tuple(match for match in parsed_content.select(selector))

@override
def find_links(self, parsed_content: Tag, selector: str) -> Iterable[str]:
def find_links(self, parsed_content: Tag, selector: str, attribute: str) -> Iterable[str]:
link: Tag
urls: list[str] = []
for link in parsed_content.select(selector):
url = link.attrs.get('href')
url = link.attrs.get(attribute)
if url:
urls.append(url.strip())
return urls
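
The BeautifulSoup parser gets the same treatment. A standalone sketch with illustrative HTML:

```python
# Sketch of the BeautifulSoup variant: attrs.get() is keyed by the
# caller-supplied attribute name. Assumes beautifulsoup4 is installed.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<li data-href="/page_2">item</li><a href="/page_1">x</a>', 'html.parser')
urls = [url.strip() for tag in soup.select('li') if (url := tag.attrs.get('data-href'))]
print(urls)  # ['/page_2']
```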
4 changes: 3 additions & 1 deletion src/crawlee/crawlers/_http/_http_parser.py
@@ -43,5 +43,7 @@ def is_matching_selector(self, parsed_content: bytes, selector: str) -> bool: # Intentional unused argument.
return False

@override
def find_links(self, parsed_content: bytes, selector: str) -> Iterable[str]: # Intentional unused argument.
def find_links(
self, parsed_content: bytes, selector: str, attribute: str
) -> Iterable[str]: # Intentional unused argument.
return []
4 changes: 2 additions & 2 deletions src/crawlee/crawlers/_parsel/_parsel_parser.py
@@ -37,11 +37,11 @@ def is_matching_selector(self, parsed_content: Selector, selector: str) -> bool:
return parsed_content.type in ('html', 'xml') and parsed_content.css(selector).get() is not None

@override
def find_links(self, parsed_content: Selector, selector: str) -> Iterable[str]:
def find_links(self, parsed_content: Selector, selector: str, attribute: str) -> Iterable[str]:
link: Selector
urls: list[str] = []
for link in parsed_content.css(selector):
url = link.xpath('@href').get()
url = link.xpath(f'@{attribute}').get()
if url:
urls.append(url.strip())
return urls
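
The Parsel variant differs in one detail: the attribute name is interpolated into an XPath expression, so it is assumed to be a plain attribute token. A standalone sketch with illustrative HTML:

```python
# Sketch of the Parsel variant: f'@{attribute}' builds the XPath lookup.
from parsel import Selector

sel = Selector(text='<li data-href="/page_2">item</li><a href="/page_1">x</a>')
urls = [u.strip() for link in sel.css('li') if (u := link.xpath('@data-href').get())]
print(urls)  # ['/page_2']
```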
3 changes: 2 additions & 1 deletion src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -373,6 +373,7 @@ def _create_extract_links_function(self, context: PlaywrightPreNavCrawlingContext
async def extract_links(
*,
selector: str = 'a',
attribute: str = 'href',
label: str | None = None,
user_data: dict | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
@@ -394,7 +395,7 @@

elements = await context.page.query_selector_all(selector)
links_iterator: Iterator[str] = iter(
[url for element in elements if (url := await element.get_attribute('href')) is not None]
[url for element in elements if (url := await element.get_attribute(attribute)) is not None]
)

# Get base URL from <base> tag if present
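
In the Playwright crawler the attribute is read per element handle. A hedged sketch of that lookup against an in-memory page (requires a Playwright install with Chromium; the markup is illustrative):

```python
# Sketch of the per-element attribute lookup used above.
import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.set_content('<img src="/image_1"><img src="/image_2">')
        elements = await page.query_selector_all('img')
        urls = [url for el in elements if (url := await el.get_attribute('src')) is not None]
        print(urls)  # ['/image_1', '/image_2']
        await browser.close()


asyncio.run(main())
```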
150 changes: 93 additions & 57 deletions tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
@@ -50,20 +50,41 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:

await crawler.run(requests)

first_visited = visit.call_args_list[0][0][0]
visited = {call[0][0] for call in visit.call_args_list}

assert first_visited == redirect_url
assert visited == {
redirect_url,
str(server_url / 'sub_index'),
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
str(server_url / 'page_4'),
str(server_url / 'base_page'),
str(server_url / 'base_subpath/page_5'),
}
expected_visit_calls = [
mock.call(redirect_url),
mock.call(str(server_url / 'sub_index')),
mock.call(str(server_url / 'page_1')),
mock.call(str(server_url / 'page_2')),
mock.call(str(server_url / 'page_3')),
mock.call(str(server_url / 'page_4')),
mock.call(str(server_url / 'base_page')),
mock.call(str(server_url / 'base_subpath/page_5')),
]
assert visit.mock_calls[0] == expected_visit_calls[0]
visit.assert_has_calls(expected_visit_calls, any_order=True)


async def test_enqueue_non_href_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:
redirect_target = str(server_url / 'start_enqueue_non_href')
redirect_url = str(redirect_server_url.with_path('redirect').with_query(url=redirect_target))
requests = [redirect_url]

crawler = BeautifulSoupCrawler(http_client=http_client)
visit = mock.Mock()

@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
visit(context.request.url)
await context.enqueue_links(selector='img', attribute='src')

await crawler.run(requests)

expected_visit_calls = [
mock.call(redirect_url),
mock.call(str(server_url / 'base_subpath/image_1')),
mock.call(str(server_url / 'image_2')),
]
visit.assert_has_calls(expected_visit_calls, any_order=True)


async def test_enqueue_links_selector(server_url: URL, http_client: HttpClient) -> None:
@@ -77,8 +98,11 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:

await crawler.run([str(server_url / 'start_enqueue')])

visited = {call[0][0] for call in visit.call_args_list}
assert visited == {str(server_url / 'start_enqueue'), str(server_url / 'sub_index')}
expected_visit_calls = [
mock.call(str(server_url / 'start_enqueue')),
mock.call(str(server_url / 'sub_index')),
]
visit.assert_has_calls(expected_visit_calls, any_order=True)


async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpClient) -> None:
@@ -128,18 +152,17 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:

await crawler.run([str(server_url / 'start_enqueue')])

visited = {call[0][0] for call in visit.call_args_list}

# url /page_3 should not be visited
assert visited == {
str(server_url / 'start_enqueue'),
str(server_url / 'sub_index'),
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'base_page'),
str(server_url / 'page_4'),
str(server_url / 'base_subpath/page_5'),
}
expected_visit_calls = [
mock.call(str(server_url / 'start_enqueue')),
mock.call(str(server_url / 'sub_index')),
mock.call(str(server_url / 'page_1')),
mock.call(str(server_url / 'page_2')),
mock.call(str(server_url / 'base_page')),
mock.call(str(server_url / 'page_4')),
mock.call(str(server_url / 'base_subpath/page_5')),
]
visit.assert_has_calls(expected_visit_calls, any_order=True)

    # All urls added to `enqueue_links` must have a custom header
assert headers[1]['transform-header'] == 'my-header'
@@ -167,14 +190,14 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await context.enqueue_links()

await crawler.run([str(server_url / 'start_enqueue')])
visited = {call[0][0] for call in visit.call_args_list}

assert visited == {
str(server_url / 'start_enqueue'),
str(server_url / 'sub_index'),
str(server_url / 'base_page'),
str(server_url / 'base_subpath/page_5'),
}
expected_visit_calls = [
mock.call(str(server_url / 'start_enqueue')),
mock.call(str(server_url / 'sub_index')),
mock.call(str(server_url / 'base_page')),
mock.call(str(server_url / 'base_subpath/page_5')),
]
visit.assert_has_calls(expected_visit_calls, any_order=True)


async def test_respect_robots_txt_with_problematic_links(server_url: URL, http_client: HttpClient) -> None:
Expand All @@ -198,17 +221,19 @@ async def error_handler(context: BasicCrawlingContext, _error: Exception) -> Non

await crawler.run([str(server_url / 'problematic_links')])

visited = {call[0][0] for call in visit.call_args_list}
failed = {call[0][0] for call in fail.call_args_list}

# Email must be skipped
# https://avatars.githubusercontent.com/apify does not get robots.txt, but is correct for the crawler.
assert visited == {str(server_url / 'problematic_links'), 'https://avatars.githubusercontent.com/apify'}
expected_visit_calls = [
mock.call(str(server_url / 'problematic_links')),
mock.call('https://avatars.githubusercontent.com/apify'),
]
visit.assert_has_calls(expected_visit_calls, any_order=True)

    # The domain budplaceholder.com does not exist.
assert failed == {
'https://budplaceholder.com/',
}
expected_fail_calls = [
mock.call('https://budplaceholder.com/'),
]
fail.assert_has_calls(expected_fail_calls, any_order=True)


async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
@@ -225,14 +250,13 @@ async def skipped_hook(url: str, _reason: SkippedReason) -> None:

await crawler.run([str(server_url / 'start_enqueue')])

skipped = {call[0][0] for call in skip.call_args_list}

assert skipped == {
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
str(server_url / 'page_4'),
}
expected_skip_calls = [
mock.call(str(server_url / 'page_1')),
mock.call(str(server_url / 'page_2')),
mock.call(str(server_url / 'page_3')),
mock.call(str(server_url / 'page_4')),
]
skip.assert_has_calls(expected_skip_calls, any_order=True)


async def test_extract_links(server_url: URL, http_client: HttpClient) -> None:
@@ -250,6 +274,21 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
assert extracted_links[0] == str(server_url / 'page_1')


async def test_extract_non_href_links(server_url: URL, http_client: HttpClient) -> None:
crawler = BeautifulSoupCrawler(http_client=http_client)
extracted_links: list[str] = []

@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
links = await context.extract_links(selector='li', attribute='data-href')
extracted_links.extend(request.url for request in links)

await crawler.run([str(server_url / 'non_href_links')])

assert len(extracted_links) == 1
assert extracted_links[0] == str(server_url / 'page_2')


@pytest.mark.parametrize(
('queue_name', 'queue_alias', 'by_id'),
[
@@ -444,12 +483,9 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:

await crawler.run(requests)

first_visited = visit.call_args_list[0][0][0]
visited = {call[0][0] for call in visit.call_args_list}

assert first_visited == start_url
# Only one link should be enqueued from sub_index due to the limit
assert visited == {
start_url,
str(server_url / 'page_3'),
}
expected_visit_calls = [
mock.call(start_url),
mock.call(str(server_url / 'page_3')),
]
visit.assert_has_calls(expected_visit_calls, any_order=True)
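
The new tests rely on server fixtures (`start_enqueue_non_href`, `non_href_links`) that live in the shared test server, not in this diff. A hypothetical reconstruction of what they might serve, inferred from the assertions above:

```python
# Hypothetical fixture sketch: a <base> tag would explain why image_1
# resolves under /base_subpath/ while /image_2 resolves at the server root.
START_ENQUEUE_NON_HREF = """
<html>
  <head><base href="/base_subpath/"></head>
  <body><img src="image_1"><img src="/image_2"></body>
</html>
"""

NON_HREF_LINKS = '<html><body><li data-href="/page_2">item</li></body></html>'
```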