Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions python/semantic_kernel/core_plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from semantic_kernel.core_plugins.text_memory_plugin import TextMemoryPlugin
from semantic_kernel.core_plugins.text_plugin import TextPlugin
from semantic_kernel.core_plugins.time_plugin import TimePlugin
from semantic_kernel.core_plugins.web_scraper_plugin import WebScraperPlugin
from semantic_kernel.core_plugins.web_search_engine_plugin import WebSearchEnginePlugin

__all__ = [
Expand All @@ -21,5 +22,6 @@
"TextMemoryPlugin",
"TextPlugin",
"TimePlugin",
"WebScraperPlugin",
"WebSearchEnginePlugin",
]
251 changes: 251 additions & 0 deletions python/semantic_kernel/core_plugins/web_scraper_plugin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
# Copyright (c) Microsoft. All rights reserved.

import json
import logging
from typing import Annotated, Any
from urllib.parse import quote

import aiohttp

from semantic_kernel.exceptions import FunctionExecutionException
from semantic_kernel.functions.kernel_function_decorator import kernel_function
from semantic_kernel.kernel_pydantic import KernelBaseModel

logger = logging.getLogger(__name__)
Copy link

Copilot AI Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

logger is defined but never used in this module, which will fail Ruff (unused variable). Remove the logging import and logger = logging.getLogger(__name__), or use logger for actual logging in this plugin.

Copilot uses AI. Check for mistakes.


class WebScraperPlugin(KernelBaseModel):
    """A plugin that provides web scraping functionality using CRW.

    CRW is an open-source web scraper for AI agents that exposes a
    Firecrawl-compatible REST API. It supports scraping single pages,
    crawling entire websites, and discovering site maps.

    GitHub: https://github.com/nicepkg/crw

    Usage:
        kernel.add_plugin(
            WebScraperPlugin(base_url="http://localhost:3000"),
            "WebScraper",
        )

        # With authentication:
        kernel.add_plugin(
            WebScraperPlugin(
                base_url="http://localhost:3000",
                api_key="fc-your-api-key",
            ),
            "WebScraper",
        )

    Examples:
        {{WebScraper.scrape_url "https://example.com"}}
        {{WebScraper.crawl_website "https://example.com"}}
        {{WebScraper.map_site "https://example.com"}}
    """

    base_url: str = "http://localhost:3000"
    """Base URL of the CRW server."""

    api_key: str | None = None
    """Optional Bearer token for authenticating with the CRW server."""

    request_timeout: float = 30.0
    """Total timeout in seconds for each HTTP request to the CRW server."""

    markdown_preview_chars: int = 500
    """Maximum characters of per-page markdown included in crawl status summaries."""

    def _headers(self) -> dict[str, str]:
        """Build request headers including auth if configured."""
        headers: dict[str, str] = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        return headers

    @staticmethod
    async def _parse_response(response: "aiohttp.ClientResponse") -> dict[str, Any]:
        """Validate the HTTP status and parse the JSON body of a CRW response.

        The status is checked *before* JSON parsing so that non-JSON error
        bodies (e.g. an HTML 502 page from a proxy) still surface as a
        FunctionExecutionException rather than an aiohttp parsing error.

        Raises:
            FunctionExecutionException: If the server returned an HTTP error
                or the success response body is not valid JSON.
        """
        if response.status >= 400:
            try:
                result = await response.json()
                error_msg = result.get("error", f"HTTP {response.status}")
            except Exception:
                # Error body was not JSON; fall back to the bare status code.
                error_msg = f"HTTP {response.status}"
            raise FunctionExecutionException(f"CRW request failed: {error_msg}")
        try:
            return await response.json()
        except Exception as ex:
            raise FunctionExecutionException(f"CRW returned a non-JSON response: {ex}") from ex

    async def _post(self, path: str, body: dict[str, Any]) -> dict[str, Any]:
        """Send a POST request to the CRW server and return the JSON response.

        Args:
            path: The API path (e.g. "/v1/scrape"), appended to base_url.
            body: The JSON-serializable request payload.

        Raises:
            FunctionExecutionException: If the request fails or returns an error.
        """
        url = f"{self.base_url.rstrip('/')}{path}"
        # Bound the request duration so an unreachable server cannot hang
        # the coroutine indefinitely.
        timeout = aiohttp.ClientTimeout(total=self.request_timeout)
        async with (
            aiohttp.ClientSession(timeout=timeout) as session,
            session.post(url, headers=self._headers(), data=json.dumps(body)) as response,
        ):
            return await self._parse_response(response)

    async def _get(self, path: str) -> dict[str, Any]:
        """Send a GET request to the CRW server and return the JSON response.

        Args:
            path: The API path (e.g. "/v1/crawl/<id>"), appended to base_url.

        Raises:
            FunctionExecutionException: If the request fails or returns an error.
        """
        url = f"{self.base_url.rstrip('/')}{path}"
        timeout = aiohttp.ClientTimeout(total=self.request_timeout)
        async with (
            aiohttp.ClientSession(timeout=timeout) as session,
            session.get(url, headers=self._headers()) as response,
        ):
            return await self._parse_response(response)

    @kernel_function(
        name="scrape_url",
        description="Scrape a web page and return its content as markdown",
    )
    async def scrape_url(
        self,
        url: Annotated[str, "The URL to scrape"],
        formats: Annotated[str | None, "Comma-separated output formats (markdown, html, links, plainText)"] = None,
        only_main_content: Annotated[bool, "Strip navigation, footer, and sidebar content"] = True,
        css_selector: Annotated[str | None, "CSS selector to extract specific elements"] = None,
    ) -> str:
        """Scrape a single web page and return its content.

        Args:
            url: The URL to scrape.
            formats: Comma-separated output formats. Defaults to "markdown".
            only_main_content: If True, strips navigation, footer, sidebar.
            css_selector: Optional CSS selector to extract specific elements.

        Returns:
            The scraped content as a string.

        Raises:
            FunctionExecutionException: If url is empty or the request fails.
        """
        if not url:
            raise FunctionExecutionException("url cannot be `None` or empty")

        body: dict[str, Any] = {
            "url": url,
            "onlyMainContent": only_main_content,
        }

        if formats:
            # Drop empty entries (e.g. from trailing commas) so the CRW API
            # never receives blank format names.
            requested = [f.strip() for f in formats.split(",") if f.strip()]
            body["formats"] = requested or ["markdown"]
        else:
            body["formats"] = ["markdown"]

        if css_selector:
            body["cssSelector"] = css_selector

        result = await self._post("/v1/scrape", body)

        data = result.get("data", {})

        # Return markdown first, then fall back to other formats
        if data.get("markdown"):
            return data["markdown"]
        if data.get("html"):
            return data["html"]
        if data.get("plainText"):
            return data["plainText"]
        if data.get("links"):
            return json.dumps(data["links"])

        return json.dumps(data)

    @kernel_function(
        name="crawl_website",
        description=(
            "Start an asynchronous crawl of a website and return a crawl job ID. "
            "Call check_crawl_status with the returned ID to retrieve results."
        ),
    )
    async def crawl_website(
        self,
        url: Annotated[str, "The starting URL to crawl"],
        max_depth: Annotated[int, "Maximum link-follow depth"] = 2,
        max_pages: Annotated[int, "Maximum number of pages to scrape"] = 10,
    ) -> str:
        """Start a crawl job and return the crawl job ID.

        The crawl runs asynchronously. Use check_crawl_status to poll for results.

        Args:
            url: The starting URL to crawl.
            max_depth: Maximum link-follow depth (default 2).
            max_pages: Maximum pages to scrape (default 10).

        Returns:
            The crawl job ID that can be used with check_crawl_status.

        Raises:
            FunctionExecutionException: If url is empty, the request fails,
                or the server does not return a job ID.
        """
        if not url:
            raise FunctionExecutionException("url cannot be `None` or empty")

        body: dict[str, Any] = {
            "url": url,
            "maxDepth": max_depth,
            "maxPages": max_pages,
            "formats": ["markdown"],
        }

        result = await self._post("/v1/crawl", body)
        crawl_id = result.get("id", "")
        if not crawl_id:
            raise FunctionExecutionException("CRW did not return a crawl job ID")

        return crawl_id

    @kernel_function(
        name="check_crawl_status",
        description="Check the status and results of a crawl job",
    )
    async def check_crawl_status(
        self,
        crawl_id: Annotated[str, "The crawl job ID returned by crawl_website"],
    ) -> str:
        """Check the status of a running or completed crawl job.

        Page markdown in the summary is truncated to markdown_preview_chars
        characters; each page entry carries a "truncated" flag when content
        was cut.

        Args:
            crawl_id: The crawl job ID returned by crawl_website.

        Returns:
            JSON string with crawl status and any available results.

        Raises:
            FunctionExecutionException: If crawl_id is empty or the request fails.
        """
        if not crawl_id:
            raise FunctionExecutionException("crawl_id cannot be `None` or empty")

        # Percent-encode the (LLM-supplied) id so path-unsafe characters like
        # "../" or "?" cannot redirect the request to a different endpoint.
        result = await self._get(f"/v1/crawl/{quote(crawl_id, safe='')}")

        status = result.get("status", "unknown")
        pages = result.get("data", [])

        summary: dict[str, Any] = {
            "status": status,
            "total": result.get("total", 0),
            "completed": result.get("completed", 0),
        }

        if pages:
            limit = self.markdown_preview_chars
            summary["pages"] = [
                {
                    "url": page.get("metadata", {}).get("sourceURL", ""),
                    "title": page.get("metadata", {}).get("title", ""),
                    "markdown": page.get("markdown", "")[:limit],
                    "truncated": len(page.get("markdown", "")) > limit,
                }
                for page in pages
            ]

        return json.dumps(summary, indent=2)

    @kernel_function(
        name="map_site",
        description="Discover all URLs on a website by following links and reading sitemaps",
    )
    async def map_site(
        self,
        url: Annotated[str, "The URL to discover links from"],
        max_depth: Annotated[int, "Maximum discovery depth"] = 2,
        use_sitemap: Annotated[bool, "Also read sitemap.xml"] = True,
    ) -> str:
        """Discover all URLs on a website.

        Args:
            url: The URL to discover links from.
            max_depth: Maximum discovery depth (default 2).
            use_sitemap: Whether to also read sitemap.xml (default True).

        Returns:
            JSON array of discovered URLs.

        Raises:
            FunctionExecutionException: If url is empty or the request fails.
        """
        if not url:
            raise FunctionExecutionException("url cannot be `None` or empty")

        body: dict[str, Any] = {
            "url": url,
            "maxDepth": max_depth,
            "useSitemap": use_sitemap,
        }

        result = await self._post("/v1/map", body)
        # The Firecrawl v1 /map response places "links" at the top level;
        # some server versions nest them under "data" — accept either shape.
        links = result.get("links") or result.get("data", {}).get("links", [])

        return json.dumps(links)
Loading
Loading