Skip to content
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For all entities it's no longer a part of scraping process. It's now within GitHub processing part. We need to structure it properly.

Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,10 @@ def handle(self, *args, **options):
else:
logger.info("Skipped related URL %s", verified_url)

chapter.leaders_raw = scraper.get_leaders()

repository = chapter.owasp_repository
chapter.leaders_raw = chapter.get_leaders(repository)
chapter.invalid_urls = sorted(invalid_urls)
chapter.related_urls = sorted(related_urls)

chapters.append(chapter)

time.sleep(0.5)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ def handle(self, *args, **options):
else:
logger.info("Skipped related URL %s", verified_url)

committee.leaders_raw = scraper.get_leaders()

repository = committee.owasp_repository
committee.leaders_raw = committee.get_leaders(repository)
committee.invalid_urls = sorted(invalid_urls)
committee.related_urls = sorted(related_urls)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,10 @@ def handle(self, *args, **options):
else:
logger.info("Skipped related URL %s", verified_url)

project.leaders_raw = scraper.get_leaders()

repository = project.owasp_repository
project.leaders_raw = project.get_leaders(repository)
project.invalid_urls = sorted(invalid_urls)
project.related_urls = sorted(related_urls)

projects.append(project)

time.sleep(0.5)
Expand Down
34 changes: 34 additions & 0 deletions backend/apps/owasp/models/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
from urllib.parse import urlparse

import requests
import yaml
from django.db import models
from django.db.models import Sum
Expand Down Expand Up @@ -146,6 +147,39 @@ def get_top_contributors(self, repositories=()):
.order_by("-total_contributions")[:TOP_CONTRIBUTORS_LIMIT]
]

def get_leaders(self, repository):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add tests for the new method.

"""Get leaders from leaders.md file on GitHub."""
try:
content = get_repository_file_content(
f"https://raw.githubusercontent.com/OWASP/{repository.key}/{repository.default_branch}/leaders.md"
)
except (requests.exceptions.RequestException, ValueError) as e:
logger.exception(
"Failed to fetch leaders.md file",
extra={"repository": repository.name, "error": str(e)},
)
return []
leaders = []
try:
if not content:
logger.warning("Empty leaders.md content", extra={"repository": repository.name})
return leaders
lines = content.split("\n")
logger.debug("Content length: %d characters", len(content))
small_size = 500
if len(content) < small_size: # Only log full content if it's reasonably small
logger.debug("Content: %s", content)
for line in lines:
logger.debug("Processing line: %s", line)
# Match both standard Markdown list items with links and variations
match = re.findall(r"\*\s*\[([^\]]+)\](?:\([^)]*\))?", line)
leaders.extend(match)
except AttributeError:
logger.exception(
"Unable to parse leaders.md content", extra={"repository": repository.name}
)
return leaders


class GenericEntityModel(models.Model):
"""Generic entity model."""
Expand Down
10 changes: 0 additions & 10 deletions backend/apps/owasp/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,6 @@ def get_urls(self, domain=None):
else self.page_tree.xpath("//div[@class='sidebar']//a/@href")
)

def get_leaders(self):
"""Get leaders."""
leaders_header = self.page_tree.xpath("//div[@class='sidebar']//*[@id='leaders']")
if leaders_header:
leaders_ul = leaders_header[0].getnext()
if leaders_ul is not None and leaders_ul.tag == "ul":
return sorted(name.strip() for name in leaders_ul.xpath(".//li/a/text()"))

return []

def verify_url(self, url):
"""Verify URL."""
location = urlparse(url).netloc.lower()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ def test_handle(self, mock_bulk_save, command, mock_chapter, offset, chapters):
"https://invalid.com/repo3",
]
mock_scraper.verify_url.side_effect = lambda url: None if "invalid" in url else url
mock_scraper.get_leaders.return_value = "Leaders data"
mock_scraper.page_tree = True

mock_chapter.get_related_url.side_effect = lambda url, **_: url
Expand Down Expand Up @@ -91,4 +90,3 @@ def test_handle(self, mock_bulk_save, command, mock_chapter, offset, chapters):
expected_related_urls = ["https://example.com/repo1", "https://example.com/repo2"]
assert chapter.invalid_urls == sorted(expected_invalid_urls)
assert chapter.related_urls == sorted(expected_related_urls)
assert chapter.leaders_raw == "Leaders data"
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ def test_handle(self, mock_bulk_save, command, mock_committee, offset, committee
"https://invalid.com/repo3",
]
mock_scraper.verify_url.side_effect = lambda url: None if "invalid" in url else url
mock_scraper.get_leaders.return_value = "Leaders data"
mock_scraper.page_tree = True

mock_committee.get_related_url.side_effect = lambda url, **_: url
Expand Down Expand Up @@ -91,4 +90,3 @@ def test_handle(self, mock_bulk_save, command, mock_committee, offset, committee
expected_related_urls = ["https://example.com/repo1", "https://example.com/repo2"]
assert committee.invalid_urls == sorted(expected_invalid_urls)
assert committee.related_urls == sorted(expected_related_urls)
assert committee.leaders_raw == "Leaders data"
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ def test_handle(self, mock_github, mock_bulk_save, command, mock_project, offset
"https://invalid.com/repo3",
]
mock_scraper.verify_url.side_effect = lambda url: None if "invalid" in url else url
mock_scraper.get_leaders.return_value = "Leaders data"
mock_scraper.page_tree = True

mock_github_instance = mock.Mock()
Expand Down Expand Up @@ -107,4 +106,3 @@ def test_handle(self, mock_github, mock_bulk_save, command, mock_project, offset
]
assert project.invalid_urls == sorted(expected_invalid_urls)
assert project.related_urls == sorted(expected_related_urls)
assert project.leaders_raw == "Leaders data"
17 changes: 17 additions & 0 deletions backend/tests/owasp/models/chapter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,3 +193,20 @@ def test_from_github(self):
)

assert chapter.name == repository_mock.title

@patch("apps.owasp.models.common.get_repository_file_content")
def test_get_leaders(self, mock_get_content):
# Mock the content of the leaders.md file
mock_get_content.return_value = (
"* [Leader1](https://github.com/leader1)\n* [Leader2](https://github.com/leader2)"
)

# Create a mock repository
repository = Repository(key="test-repo", default_branch="main")

# Instantiate the model and call the get_leaders method
chapter = Chapter()
leaders = chapter.get_leaders(repository)

# Assert the leaders list
assert leaders == ["Leader1", "Leader2"]
17 changes: 17 additions & 0 deletions backend/tests/owasp/models/committee_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,20 @@ def test_from_github(self):
)

assert committee.name == repository_mock.title

@patch("apps.owasp.models.common.get_repository_file_content")
def test_get_leaders(self, mock_get_content):
# Mock the content of the leaders.md file
mock_get_content.return_value = (
"* [Leader1](https://github.com/leader1)\n* [Leader2](https://github.com/leader2)"
)

# Create a mock repository
repository = Repository(key="test-repo", default_branch="main")

# Instantiate the model and call the get_leaders method
committee = Committee()
leaders = committee.get_leaders(repository)

# Assert the leaders list
assert leaders == ["Leader1", "Leader2"]
17 changes: 17 additions & 0 deletions backend/tests/owasp/models/project_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,20 @@ def test_from_github(self):

assert project.level == Project.ProjectLevel.LAB
assert project.type == Project.ProjectType.TOOL

@patch("apps.owasp.models.common.get_repository_file_content")
def test_get_leaders(self, mock_get_content):
# Mock the content of the leaders.md file
mock_get_content.return_value = (
"* [Leader1](https://github.com/leader1)\n* [Leader2](https://github.com/leader2)"
)

# Create a mock repository
repository = Repository(key="test-repo", default_branch="main")

# Instantiate the model and call the get_leaders method
project = Project()
leaders = project.get_leaders(repository)

# Assert the leaders list
assert leaders == ["Leader1", "Leader2"]
10 changes: 0 additions & 10 deletions backend/tests/owasp/scraper_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,16 +108,6 @@ def test_initialization_not_found(self, mock_session):

assert scraper.page_tree is None

def test_get_leaders_no_leaders(self, mock_session):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this test needs to be refactored too based on the new logic and the code location.

invalid_html = b"<div class='sidebar'><div id='leaders'></div></div>"
mock_response = Mock()
mock_response.content = invalid_html
mock_session.get.return_value = mock_response

scraper = OwaspScraper("https://test.org")

assert scraper.get_leaders() == []

def test_verify_url_invalid_url(self, mock_session):
response = Mock()
response.status_code = codes.ok
Expand Down