@@ -1,56 +1,45 @@
 from bs4 import BeautifulSoup
 import os
+from scrapegraph_py import ScrapeGraphClient, scrape_text
+from dotenv import load_dotenv
 
-def scrape_local_html(file_path):
+def scrape_local_html(client: ScrapeGraphClient, file_path: str, prompt: str):
     """
-    Scrape content from a local HTML file.
+    Scrape content from a local HTML file using ScrapeGraph AI.
 
     Args:
+        client (ScrapeGraphClient): Initialized ScrapeGraph client
         file_path (str): Path to the local HTML file
+        prompt (str): Natural language prompt describing what to extract
 
     Returns:
-        dict: Extracted data from the HTML file
+        str: Extracted data in JSON format
     """
-    # Check if file exists
     if not os.path.exists(file_path):
         raise FileNotFoundError(f"HTML file not found at: {file_path}")
 
-    # Read the HTML file
     with open(file_path, 'r', encoding='utf-8') as file:
         html_content = file.read()
 
-    # Parse HTML with BeautifulSoup
+    # Use BeautifulSoup to extract text content
     soup = BeautifulSoup(html_content, 'html.parser')
+    text_content = soup.get_text(separator='\n', strip=True)
 
-    # Example extraction - modify based on your HTML structure
-    data = {
-        'title': soup.title.string if soup.title else None,
-        'paragraphs': [p.text for p in soup.find_all('p')],
-        'links': [{'text': a.text, 'href': a.get('href')} for a in soup.find_all('a')],
-        'headers': [h.text for h in soup.find_all(['h1', 'h2', 'h3'])]
-    }
-
-    return data
+    # Use ScrapeGraph AI to analyze the text
+    return scrape_text(client, text_content, prompt)
 
 def main():
-    # Example usage
+    load_dotenv()
+    api_key = os.getenv("SCRAPEGRAPH_API_KEY")
+    client = ScrapeGraphClient(api_key)
+
     try:
-        # Assuming you have a sample.html file in the same directory
-        result = scrape_local_html('sample.html')
-
-        # Print extracted data
-        print("Title:", result['title'])
-        print("\nParagraphs:")
-        for p in result['paragraphs']:
-            print(f"- {p}")
-
-        print("\nLinks:")
-        for link in result['links']:
-            print(f"- {link['text']}: {link['href']}")
-
-        print("\nHeaders:")
-        for header in result['headers']:
-            print(f"- {header}")
+        result = scrape_local_html(
+            client,
+            'sample.html',
+            "Extract main content and important information"
+        )
+        print("Extracted Data:", result)
 
     except FileNotFoundError as e:
         print(f"Error: {e}")
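One gap worth noting: the new main() hands whatever os.getenv("SCRAPEGRAPH_API_KEY") returns straight to ScrapeGraphClient, so a missing .env entry would surface later as an opaque client error, and the diff shows no entry-point guard for the file. A minimal sketch of a hardened launcher, reusing scrape_local_html as defined in this diff; the run() name and the RuntimeError message are illustrative, not part of this PR:

import os

from dotenv import load_dotenv
from scrapegraph_py import ScrapeGraphClient

def run():
    # Load SCRAPEGRAPH_API_KEY from a local .env file, then fail fast
    # with a clear message instead of passing None to the client.
    load_dotenv()
    api_key = os.getenv("SCRAPEGRAPH_API_KEY")
    if not api_key:
        raise RuntimeError("SCRAPEGRAPH_API_KEY is not set; add it to .env")
    client = ScrapeGraphClient(api_key)
    # scrape_local_html is the function defined in this PR's diff,
    # assumed to live in the same module as this sketch.
    result = scrape_local_html(
        client,
        'sample.html',
        "Extract main content and important information"
    )
    print("Extracted Data:", result)

if __name__ == "__main__":
    run()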
|