
Commit 198f9e3

Changed the logic of fetching leaders' data

1 parent c55e7ee · commit 198f9e3

File tree: 5 files changed, +32 -19 lines


backend/apps/owasp/management/commands/owasp_scrape_chapters.py

Lines changed: 2 additions & 3 deletions

@@ -63,11 +63,10 @@ def handle(self, *args, **options):
                 else:
                     logger.info("Skipped related URL %s", verified_url)
 
-            chapter.leaders_raw = scraper.get_leaders()
-
+            repository = chapter.owasp_repository
+            chapter.leaders_raw = scraper.get_leaders(repository)
             chapter.invalid_urls = sorted(invalid_urls)
             chapter.related_urls = sorted(related_urls)
-
             chapters.append(chapter)
 
             time.sleep(0.5)

backend/apps/owasp/management/commands/owasp_scrape_committees.py

Lines changed: 2 additions & 2 deletions

@@ -63,8 +63,8 @@ def handle(self, *args, **options):
                 else:
                     logger.info("Skipped related URL %s", verified_url)
 
-            committee.leaders_raw = scraper.get_leaders()
-
+            repository = committee.owasp_repository
+            committee.leaders_raw = scraper.get_leaders(repository)
             committee.invalid_urls = sorted(invalid_urls)
             committee.related_urls = sorted(related_urls)
 

backend/apps/owasp/management/commands/owasp_scrape_projects.py

Lines changed: 2 additions & 3 deletions

@@ -75,11 +75,10 @@ def handle(self, *args, **options):
                 else:
                     logger.info("Skipped related URL %s", verified_url)
 
-            project.leaders_raw = scraper.get_leaders()
-
+            repository = project.owasp_repository
+            project.leaders_raw = scraper.get_leaders(repository)
             project.invalid_urls = sorted(invalid_urls)
             project.related_urls = sorted(related_urls)
-
            projects.append(project)
 
            time.sleep(0.5)

backend/apps/owasp/scraper.py

Lines changed: 21 additions & 9 deletions

@@ -1,13 +1,16 @@
 """OWASP scraper."""
 
 import logging
+import re
 from urllib.parse import urlparse
 
 import requests
 from lxml import etree, html
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
+from apps.github.utils import get_repository_file_content
+
 logger = logging.getLogger(__name__)
 
 MAX_RETRIES = 3

@@ -55,15 +58,24 @@ def get_urls(self, domain=None):
             else self.page_tree.xpath("//div[@class='sidebar']//a/@href")
         )
 
-    def get_leaders(self):
-        """Get leaders."""
-        leaders_header = self.page_tree.xpath("//div[@class='sidebar']//*[@id='leaders']")
-        if leaders_header:
-            leaders_ul = leaders_header[0].getnext()
-            if leaders_ul is not None and leaders_ul.tag == "ul":
-                return sorted(name.strip() for name in leaders_ul.xpath(".//li/a/text()"))
-
-        return []
+    def get_leaders(self, repository):
+        """Get leaders from leaders.md file on GitHub."""
+        content = get_repository_file_content(
+            f"https://raw.githubusercontent.com/OWASP/{repository.key}/{repository.default_branch}/leaders.md"
+        )
+        leaders = []
+        try:
+            lines = content.split("\n")
+            logger.debug("Content: %s", content)
+            for line in lines:
+                logger.debug("Processing line: %s", line)
+                match = re.findall(r"\* \[([^\]]+)\]", line)
+                leaders.extend(match)
+        except AttributeError:
+            logger.exception(
+                "Unable to parse leaders.md content", extra={"repository": repository.name}
+            )
+        return leaders
 
     def verify_url(self, url):
         """Verify URL."""

backend/tests/owasp/scraper_test.py

Lines changed: 5 additions & 2 deletions

@@ -113,10 +113,13 @@ def test_get_leaders_no_leaders(self, mock_session):
         mock_response = Mock()
         mock_response.content = invalid_html
         mock_session.get.return_value = mock_response
-
         scraper = OwaspScraper("https://test.org")
+        mock_repository = Mock()
+        mock_repository.key = "test-repo"
+        mock_repository.default_branch = "main"
+        mock_repository.name = "Test Repository"
 
-        assert scraper.get_leaders() == []
+        assert scraper.get_leaders(mock_repository) == []
 
     def test_verify_url_invalid_url(self, mock_session):
         response = Mock()
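For a positive-path test of the new behavior, one would likely stub get_repository_file_content where scraper.py imports it, so get_leaders() never reaches raw.githubusercontent.com. A minimal sketch, reusing the scraper and mock_repository objects set up above and assuming the name can be patched at apps.owasp.scraper; the sample content is hypothetical:

from unittest.mock import patch

leaders_md = (
    "* [Jane Doe](mailto:jane.doe@owasp.org)\n"
    "* [John Smith](mailto:john.smith@owasp.org)\n"
)
# Patch the name as imported into apps.owasp.scraper so no HTTP request is made.
with patch("apps.owasp.scraper.get_repository_file_content", return_value=leaders_md):
    assert scraper.get_leaders(mock_repository) == ["Jane Doe", "John Smith"]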
