// ScrapeGraphAI crawl example: start a crawl job with a structured-output
// schema, then poll until the extraction result is ready.
import { crawl, getCrawlRequest } from '../index.js';
import 'dotenv/config';

// Example .env file:
// SGAI_APIKEY=your_sgai_api_key

const apiKey = process.env.SGAI_APIKEY;
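
// JSON Schema describing the structure we want the crawl's extraction
// result to follow (company info, services, and legal pages).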
const schema = {
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "ScrapeGraphAI Website Content",
  "type": "object",
  "properties": {
    "company": {
      "type": "object",
      "properties": {
        "name": { "type": "string" },
        "description": { "type": "string" },
        "features": { "type": "array", "items": { "type": "string" } },
        "contact_email": { "type": "string", "format": "email" },
        "social_links": {
          "type": "object",
          "properties": {
            "github": { "type": "string", "format": "uri" },
            "linkedin": { "type": "string", "format": "uri" },
            "twitter": { "type": "string", "format": "uri" }
          },
          "additionalProperties": false
        }
      },
      "required": ["name", "description"]
    },
    "services": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "service_name": { "type": "string" },
          "description": { "type": "string" },
          "features": { "type": "array", "items": { "type": "string" } }
        },
        "required": ["service_name", "description"]
      }
    },
    "legal": {
      "type": "object",
      "properties": {
        "privacy_policy": { "type": "string" },
        "terms_of_service": { "type": "string" }
      },
      "required": ["privacy_policy", "terms_of_service"]
    }
  },
  "required": ["company", "services", "legal"]
};

const url = 'https://scrapegraphai.com/';
const prompt = 'What does the company do? And I need text content from their privacy and terms.';
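
// The prompt steers what the LLM extracts; the schema above constrains the
// shape of the returned JSON.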

(async () => {
  if (!apiKey) {
    console.error('SGAI_APIKEY not found in environment. Please set it in your .env file.');
    process.exit(1);
  }

  try {
    // Start the crawl job
    console.log(`\nStarting crawl for: ${url}`);
    const crawlResponse = await crawl(apiKey, url, prompt, schema, {
      cacheWebsite: true, // cache the fetched website content
      depth: 2, // maximum link depth to follow from the start URL
      maxPages: 2, // maximum number of pages to crawl
      sameDomainOnly: true, // restrict crawling to the starting domain
      batchSize: 1, // number of pages processed per batch
    });
    console.log('\nCrawl job started. Response:');
    console.log(JSON.stringify(crawlResponse, null, 2));

    // If the crawl is asynchronous and returns an ID, fetch the result
    const crawlId = crawlResponse.id || crawlResponse.task_id;
    if (crawlId) {
      console.log('\nPolling for crawl result...');
      // Poll up to 10 times, waiting 5 seconds between attempts (~50s total)
      for (let i = 0; i < 10; i++) {
        await new Promise((resolve) => setTimeout(resolve, 5000));
        const result = await getCrawlRequest(apiKey, crawlId);
        if (result.status === 'success' && result.result) {
          console.log('\nCrawl completed. Result:');
          console.log(JSON.stringify(result.result.llm_result, null, 2));
          break;
        } else if (result.status === 'failed') {
          console.log('\nCrawl failed. Result:');
          console.log(JSON.stringify(result, null, 2));
          break;
        } else {
          console.log(`Status: ${result.status}, waiting...`);
        }
      }
    } else {
      console.log('No crawl ID found in response. Synchronous result:');
      console.log(JSON.stringify(crawlResponse, null, 2));
    }
  } catch (error) {
    console.error('Error occurred:', error);
  }
})();
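
// To run this example (assuming it lives in the SDK's examples folder, next
// to the package's index.js): add SGAI_APIKEY to a .env file in the working
// directory, then execute `node <this-file>.js`.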