// ScrapeGraphAI crawl example: start a crawl job with a structured-output
// schema, then poll until the extraction result is ready.
import { crawl, getCrawlRequest } from '../index.js';
import 'dotenv/config';

// Example .env file:
// SGAI_APIKEY=your_sgai_api_key

const apiKey = process.env.SGAI_APIKEY;
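
// JSON Schema describing the structure we want the crawl's extraction
// result to follow (company info, services, and legal pages).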
const schema = {
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "ScrapeGraphAI Website Content",
  "type": "object",
  "properties": {
    "company": {
      "type": "object",
      "properties": {
        "name": { "type": "string" },
        "description": { "type": "string" },
        "features": { "type": "array", "items": { "type": "string" } },
        "contact_email": { "type": "string", "format": "email" },
        "social_links": {
          "type": "object",
          "properties": {
            "github": { "type": "string", "format": "uri" },
            "linkedin": { "type": "string", "format": "uri" },
            "twitter": { "type": "string", "format": "uri" }
          },
          "additionalProperties": false
        }
      },
      "required": ["name", "description"]
    },
    "services": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "service_name": { "type": "string" },
          "description": { "type": "string" },
          "features": { "type": "array", "items": { "type": "string" } }
        },
        "required": ["service_name", "description"]
      }
    },
    "legal": {
      "type": "object",
      "properties": {
        "privacy_policy": { "type": "string" },
        "terms_of_service": { "type": "string" }
      },
      "required": ["privacy_policy", "terms_of_service"]
    }
  },
  "required": ["company", "services", "legal"]
};

const url = 'https://scrapegraphai.com/';
const prompt = 'What does the company do? And I need text content from their privacy and terms.';
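
// The prompt steers what the LLM extracts; the schema above constrains the
// shape of the returned JSON.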

(async () => {
  if (!apiKey) {
    console.error('SGAI_APIKEY not found in environment. Please set it in your .env file.');
    process.exit(1);
  }

  try {
    // Start the crawl job
    console.log(`\nStarting crawl for: ${url}`);
    const crawlResponse = await crawl(apiKey, url, prompt, schema, {
      cacheWebsite: true, // cache the fetched website content
      depth: 2, // maximum link depth to follow from the start URL
      maxPages: 2, // maximum number of pages to crawl
      sameDomainOnly: true, // restrict crawling to the starting domain
      batchSize: 1, // number of pages processed per batch
    });
    console.log('\nCrawl job started. Response:');
    console.log(JSON.stringify(crawlResponse, null, 2));

    // If the crawl is asynchronous and returns an ID, fetch the result
    const crawlId = crawlResponse.id || crawlResponse.task_id;
    if (crawlId) {
      console.log('\nPolling for crawl result...');
      // Poll up to 10 times, waiting 5 seconds between attempts (~50s total)
      for (let i = 0; i < 10; i++) {
        await new Promise((resolve) => setTimeout(resolve, 5000));
        const result = await getCrawlRequest(apiKey, crawlId);
        if (result.status === 'success' && result.result) {
          console.log('\nCrawl completed. Result:');
          console.log(JSON.stringify(result.result.llm_result, null, 2));
          break;
        } else if (result.status === 'failed') {
          console.log('\nCrawl failed. Result:');
          console.log(JSON.stringify(result, null, 2));
          break;
        } else {
          console.log(`Status: ${result.status}, waiting...`);
        }
      }
    } else {
      console.log('No crawl ID found in response. Synchronous result:');
      console.log(JSON.stringify(crawlResponse, null, 2));
    }
  } catch (error) {
    console.error('Error occurred:', error);
  }
})();
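
// To run this example (assuming it lives in the SDK's examples folder, next
// to the package's index.js): add SGAI_APIKEY to a .env file in the working
// directory, then execute `node <this-file>.js`.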