Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 20 additions & 20 deletions agentstack/_tools/agentql/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,32 +17,32 @@ def query_data(url: str, query: Optional[str], prompt: Optional[str]) -> dict:

AgentQL query to scrape the url.

Here is a guide on AgentQL query syntax:
Here is a guide on AgentQL query syntax:

Enclose all AgentQL query terms within curly braces `{}`. The following query structure isn't valid because the term "social\_media\_links" is wrongly enclosed within parenthesis `()`.
Enclose all AgentQL query terms within curly braces `{}`. The following query structure isn't valid because the term "social_media_links" is wrongly enclosed within parenthesis `()`.

```
( # Should be {
social_media_links(The icons that lead to Facebook, Snapchat, etc.)[]
) # Should be }
```
```
( # Should be {
social_media_links(The icons that lead to Facebook, Snapchat, etc.)[]
) # Should be }
```

The following query is also invalid since its missing the curly braces `{}`
The following query is also invalid since it's missing the curly braces `{}`

```
# should include {
social_media_links(The icons that lead to Facebook, Snapchat, etc.)[]
# should include }
```
```
# should include {
social_media_links(The icons that lead to Facebook, Snapchat, etc.)[]
# should include }
```

You can't include new lines in your semantic context. The following query structure isn't valid because the semantic context isn't contained within one line.
You can't include new lines in your semantic context. The following query structure isn't valid because the semantic context isn't contained within one line.

```
{
social_media_links(The icons that lead
to Facebook, Snapchat, etc.)[]
}
```
```
{
social_media_links(The icons that lead
to Facebook, Snapchat, etc.)[]
}
```
"""
payload = {
"url": url,
Expand Down
105 changes: 104 additions & 1 deletion agentstack/_tools/firecrawl/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
from firecrawl import FirecrawlApp

from typing import List, Dict, Any, Optional
app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'))


Expand Down Expand Up @@ -38,3 +38,106 @@ def retrieve_web_crawl(crawl_id: str):
will tell you if the crawl is finished. If it is not, wait some more time then try again.
"""
return app.check_crawl_status(crawl_id)


def batch_scrape(urls: List[str], formats: Optional[List[str]] = None):
    """
    Batch scrape multiple URLs simultaneously.

    Args:
        urls: List of URLs to scrape
        formats: List of desired output formats; defaults to ['markdown', 'html']
            when not provided

    Returns:
        Dictionary containing the batch scrape results
    """
    # Use a None sentinel instead of a mutable list default: a shared default
    # list would persist across calls and could be mutated by a caller.
    if formats is None:
        formats = ['markdown', 'html']
    batch_result = app.batch_scrape_urls(urls, {'formats': formats})
    return batch_result


def async_batch_scrape(urls: List[str], formats: Optional[List[str]] = None):
    """
    Asynchronously batch scrape multiple URLs.

    Args:
        urls: List of URLs to scrape
        formats: List of desired output formats; defaults to ['markdown', 'html']
            when not provided

    Returns:
        Dictionary containing the job ID and status URL
    """
    # Use a None sentinel instead of a mutable list default: a shared default
    # list would persist across calls and could be mutated by a caller.
    if formats is None:
        formats = ['markdown', 'html']
    batch_job = app.async_batch_scrape_urls(urls, {'formats': formats})
    return batch_job


def check_batch_status(job_id: str):
    """
    Look up the current state of an asynchronous batch scrape job.

    Args:
        job_id: The ID of the batch scrape job

    Returns:
        Dictionary containing the current status and results if completed
    """
    status = app.check_batch_scrape_status(job_id)
    return status


def extract_data(urls: List[str], schema: Optional[Dict[str, Any]] = None, prompt: Optional[str] = None) -> Dict[
    str, Any]:
    """
    Extract structured data from URLs using LLMs.

    Args:
        urls: List of URLs to extract data from
        schema: Optional JSON schema defining the structure of data to extract
        prompt: Optional natural language prompt describing the data to extract

    Returns:
        Dictionary containing the extracted structured data
    """
    # When both are supplied, the prompt takes precedence and the schema is
    # not sent; when neither is supplied, an empty parameter dict is passed.
    extraction_params: Dict[str, Any] = {}
    if prompt is not None:
        extraction_params['prompt'] = prompt
    elif schema is not None:
        extraction_params['schema'] = schema
    return app.extract(urls, extraction_params)


def map_website(url: str, search: Optional[str] = None):
    """
    Map a website to get all URLs, with optional search functionality.

    Args:
        url: The base URL to map
        search: Optional search term to filter URLs

    Returns:
        Dictionary containing the list of discovered URLs
    """
    # Only forward the search term when it is truthy, matching the API's
    # expectation of an absent key rather than a null value.
    request_params: Dict[str, Any] = {}
    if search:
        request_params['search'] = search
    return app.map_url(url, request_params)


def batch_extract(urls: List[str], extract_params: Dict[str, Any]):
    """
    Batch extract structured data from multiple URLs.

    Args:
        urls: List of URLs to extract data from
        extract_params: Dictionary containing extraction parameters including
            prompt or schema

    Returns:
        Dictionary containing the extracted data from all URLs
    """
    # The 'extract' format tells the batch scraper to run LLM extraction
    # using the supplied parameters on every URL.
    scrape_options = {
        'formats': ['extract'],
        'extract': extract_params,
    }
    return app.batch_scrape_urls(urls, scrape_options)
11 changes: 10 additions & 1 deletion agentstack/_tools/firecrawl/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@
"dependencies": [
"firecrawl-py>=1.6.4"
],
"tools": ["web_scrape", "web_crawl", "retrieve_web_crawl"],
"tools": [
"web_scrape",
"web_crawl",
"retrieve_web_crawl",
"batch_scrape",
"check_batch_status",
"extract_data",
"map_website",
"batch_extract"
],
"cta": "Create an API key at https://www.firecrawl.dev/"
}