Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions croissant-rdf/src/croissant_rdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
from croissant_rdf._src.providers.kaggle import KaggleHarvester
from croissant_rdf._src.providers.openml import OpenmlHarvester

__all__ = ["DataverseHarvester",
"HuggingfaceHarvester",
"KaggleHarvester",
"OpenmlHarvester"]
__all__ = [
"DataverseHarvester",
"HuggingfaceHarvester",
"KaggleHarvester",
"OpenmlHarvester",
]
28 changes: 21 additions & 7 deletions croissant-rdf/src/croissant_rdf/_src/croissant_harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

DEFAULT_BASE_URL = "https://w3id.org/croissant-rdf/data/"


class CroissantHarvester(ABC):
"""Abstract base class for harvesting and processing Croissant metadata for datasets.

Expand Down Expand Up @@ -67,7 +68,9 @@ def fetch_dataset_croissant(self, dataset_id: str) -> requests.Response:
requests.Response: The response from the request to retrieve metadata."""
pass

def fetch_dataset_croissant_handler(self, dataset_id: str) -> Optional[Union[Dict, List]]:
def fetch_dataset_croissant_handler(
self, dataset_id: str
) -> Optional[Union[Dict, List]]:
"""Run the function to fetch Croissant JSON-LD from URL, and catch exceptions to return them as strings.

Args:
Expand All @@ -89,7 +92,6 @@ def fetch_dataset_croissant_handler(self, dataset_id: str) -> Optional[Union[Dic
return f"Empty error for {dataset_id}"
return f"Error for {dataset_id}: {e!s}"


def fetch_datasets_croissant(self) -> List[Dict]:
"""Fetch metadata for multiple datasets, using threading where applicable."""
try:
Expand All @@ -102,8 +104,15 @@ def fetch_datasets_croissant(self) -> List[Dict]:
errors = []
try:
with ThreadPoolExecutor(max_workers=4) as executor:
futures = {executor.submit(self.fetch_dataset_croissant_handler, dataset): dataset for dataset in datasets}
for future in track(as_completed(futures), "Fetching datasets metadata", len(futures)):
futures = {
executor.submit(
self.fetch_dataset_croissant_handler, dataset
): dataset
for dataset in datasets
}
for future in track(
as_completed(futures), "Fetching datasets metadata", len(futures)
):
result = future.result()
if isinstance(result, str):
errors.append(result)
Expand All @@ -115,7 +124,8 @@ def fetch_datasets_croissant(self) -> List[Dict]:
raise
if errors:
logger.warning(
f"Error fetching Croissant metadata JSON-LD for {len(errors)} URLs:\n" + "\n".join(errors)
f"Error fetching Croissant metadata JSON-LD for {len(errors)} URLs:\n"
+ "\n".join(errors)
)
return results

Expand Down Expand Up @@ -160,7 +170,9 @@ def generate_ttl(self) -> str:
Raises:
Exception: If there was an error generating the turtle file.
"""
logger.info(f"Searching {self.limit} datasets metadata{f' for `{self.search}`' if self.search else ''}.")
logger.info(
f"Searching {self.limit} datasets metadata{f' for `{self.search}`' if self.search else ''}."
)
try:
start_time = time.time()
datasets = self.fetch_datasets_croissant()
Expand All @@ -178,7 +190,9 @@ def generate_ttl(self) -> str:
@classmethod
def cli(cls):
"""Parse command-line arguments and generate a RDF file from harvested Croissant metadata."""
parser = argparse.ArgumentParser(description="Generate a RDF file from datasets Croissant metadata.")
parser = argparse.ArgumentParser(
description="Generate a RDF file from datasets Croissant metadata."
)
parser.add_argument(
"search",
type=str,
Expand Down
7 changes: 5 additions & 2 deletions croissant-rdf/src/croissant_rdf/_src/providers/dataverse.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,15 @@ def fetch_datasets_ids(self):
}
response = requests.get(search_url, params=params, timeout=30)
results = response.json()
return [result["global_id"] for result in results.get("data", {}).get("items", [])][: self.limit]
return [
result["global_id"] for result in results.get("data", {}).get("items", [])
][: self.limit]

def fetch_dataset_croissant(self, dataset_id: str):
# https://demo.dataverse.org/api/datasets/export?exporter=croissant&persistentId=doi:10.70122/FK2/JFASVV
return requests.get(
f"{self.api_url}/api/datasets/export?exporter=croissant&persistentId={dataset_id}", timeout=30
f"{self.api_url}/api/datasets/export?exporter=croissant&persistentId={dataset_id}",
timeout=30,
)


Expand Down
15 changes: 12 additions & 3 deletions croissant-rdf/src/croissant_rdf/_src/providers/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,23 @@

class HuggingfaceHarvester(CroissantHarvester):
api_url = "https://huggingface.co/api/datasets/"
headers = {"Authorization": f"Bearer {os.environ.get('HF_API_KEY')}"} if os.environ.get("HF_API_KEY") else {}
headers = (
{"Authorization": f"Bearer {os.environ.get('HF_API_KEY')}"}
if os.environ.get("HF_API_KEY")
else {}
)

def fetch_datasets_ids(self):
return [dataset.id for dataset in list(list_datasets(limit=self.limit, search=self.search))]
return [
dataset.id
for dataset in list(list_datasets(limit=self.limit, search=self.search))
]

def fetch_dataset_croissant(self, dataset_id: str):
url = self.api_url + dataset_id + "/croissant"
return requests.get(url, headers=self.headers if self.use_api_key else {}, timeout=30)
return requests.get(
url, headers=self.headers if self.use_api_key else {}, timeout=30
)
# resp_json = None
# try:
# response = requests.get(url, headers=self.headers if self.use_api_key else {}, timeout=30)
Expand Down
13 changes: 10 additions & 3 deletions croissant-rdf/src/croissant_rdf/_src/providers/kaggle.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,22 @@ def fetch_datasets_ids(self):
try:
from kaggle.api.kaggle_api_extended import KaggleApi
except Exception:
logger.warning("KAGGLE_USERNAME or KAGGLE_KEY are not set. Kaggle datasets IDs cannot be harvested.")
logger.warning(
"KAGGLE_USERNAME or KAGGLE_KEY are not set. Kaggle datasets IDs cannot be harvested."
)

api = KaggleApi()
api.authenticate()
# print([str(dataset) for dataset in api.dataset_list(search=self.search)[: self.limit]])
return [str(dataset) for dataset in api.dataset_list(search=self.search)[: self.limit]]
return [
str(dataset)
for dataset in api.dataset_list(search=self.search)[: self.limit]
]

def fetch_dataset_croissant(self, dataset_id: str):
return requests.get(self.api_url + str(dataset_id) + "/croissant/download", timeout=30)
return requests.get(
self.api_url + str(dataset_id) + "/croissant/download", timeout=30
)
# return response.json() if response.status_code == 200 else None


Expand Down
9 changes: 7 additions & 2 deletions croissant-rdf/src/croissant_rdf/_src/providers/openml.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,20 @@ class OpenmlHarvester(CroissantHarvester):
def fetch_datasets_ids(self):
datalist = openml.datasets.list_datasets(output_format="dataframe")
if self.search:
datalist = datalist[datalist["name"].str.contains(self.search, case=False, na=False)]
datalist = datalist[
datalist["name"].str.contains(self.search, case=False, na=False)
]
return datalist["did"].astype(str).tolist()[: self.limit]

def fetch_dataset_croissant(self, dataset_id: str):
# 3 = https://data.openml.org/datasets/0000/0003/dataset_3_croissant.json
# 44593 = https://data.openml.org/datasets/0004/44593/dataset_44593_croissant.json
extended_id = f"{int(dataset_id):08d}"
extended_id = f"{extended_id[:4]}/{extended_id[4:] if len(dataset_id) < 5 else dataset_id}"
return requests.get(self.api_url + extended_id + f"/dataset_{dataset_id}_croissant.json", timeout=30)
return requests.get(
self.api_url + extended_id + f"/dataset_{dataset_id}_croissant.json",
timeout=30,
)


def main():
Expand Down
9 changes: 5 additions & 4 deletions croissant-rdf/tests/test_fetch_dataverse.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
import tempfile



def test_dataverse():

with tempfile.NamedTemporaryFile(mode="w+b", suffix=".ttl", delete_on_close=False) as fp:

with tempfile.NamedTemporaryFile(
mode="w+b", suffix=".ttl", delete_on_close=False
) as fp:

harvester = DataverseHarvester(
fname=fp.name,
Expand All @@ -19,4 +20,4 @@ def test_dataverse():
harvester.generate_ttl()

g = Graph().parse(fp.name, format="ttl")
assert len(g) > 0
assert len(g) > 0
10 changes: 7 additions & 3 deletions croissant-rdf/tests/test_fetch_huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from tempfile import NamedTemporaryFile


@pytest.fixture
def mock_response():
mock = MagicMock()
Expand All @@ -21,7 +22,9 @@ def test_mock_croissant_dataset(mock_response):
result = harvester.fetch_dataset_croissant("test_dataset").json()

mock_get.assert_called_once_with(
"https://huggingface.co/api/datasets/test_dataset/croissant", headers=ANY, timeout=30
"https://huggingface.co/api/datasets/test_dataset/croissant",
headers=ANY,
timeout=30,
)
assert result == {"name": "test_dataset", "description": "A test dataset"}

Expand All @@ -38,7 +41,9 @@ def test_mock_fetch_datasets(mock_response):


def test_mock_fetch_datasets_empty():
with patch("croissant_rdf.HuggingfaceHarvester.fetch_dataset_croissant", return_value=[]):
with patch(
"croissant_rdf.HuggingfaceHarvester.fetch_dataset_croissant", return_value=[]
):
harvester = HuggingfaceHarvester(limit=0)
result = harvester.fetch_datasets_croissant()
assert result == []
Expand Down Expand Up @@ -68,7 +73,6 @@ def test_fetch_data_workflow():
assert "http://mlcommons.org/croissant/" in dataset["@context"]["cr"]



def test_generate_ttl():
"""Test the complete generate_ttl workflow."""
with NamedTemporaryFile(mode="w+b", suffix=".ttl", delete_on_close=False) as fp:
Expand Down
18 changes: 14 additions & 4 deletions croissant-rdf/tests/test_fetch_kaggle.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,17 @@ def test_mock_croissant_dataset(mock_response):
harvester = KaggleHarvester(limit=1)
result = harvester.fetch_dataset_croissant("test_dataset").json()

mock_get.assert_called_once_with("https://www.kaggle.com/datasets/test_dataset/croissant/download", timeout=30)
mock_get.assert_called_once_with(
"https://www.kaggle.com/datasets/test_dataset/croissant/download",
timeout=30,
)
assert result == test_metadata_kaggle


def test_mock_fetch_datasets(mock_response):
with patch.object(KaggleHarvester, "fetch_datasets_ids", return_value=["test_dataset"]), patch(
with patch.object(
KaggleHarvester, "fetch_datasets_ids", return_value=["test_dataset"]
), patch(
"requests.get",
return_value=mock_response,
):
Expand All @@ -41,10 +46,15 @@ def test_mock_fetch_datasets(mock_response):
assert len(result) == 1
assert result[0] == test_metadata_kaggle


def test_generate_ttl(mock_response):
"""Test the complete generate_ttl workflow."""
with tempfile.NamedTemporaryFile(mode="w+b", suffix=".ttl", delete_on_close=False) as fp:
with patch.object(KaggleHarvester, "fetch_datasets_ids", return_value=["test_dataset"]), patch(
with tempfile.NamedTemporaryFile(
mode="w+b", suffix=".ttl", delete_on_close=False
) as fp:
with patch.object(
KaggleHarvester, "fetch_datasets_ids", return_value=["test_dataset"]
), patch(
"requests.get",
return_value=mock_response,
):
Expand Down
7 changes: 5 additions & 2 deletions croissant-rdf/tests/test_fetch_openml.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@

from croissant_rdf import OpenmlHarvester


def test_openml():
with tempfile.NamedTemporaryFile(mode="w+b", suffix=".ttl", delete_on_close=False) as fp:
with tempfile.NamedTemporaryFile(
mode="w+b", suffix=".ttl", delete_on_close=False
) as fp:

harvester = OpenmlHarvester(fname=fp.name, limit=5, search="blood")
harvester.generate_ttl()

assert os.path.isfile(fp.name)
g = Graph().parse(fp.name, format="ttl")
assert len(g) > 0
assert len(g) > 0
13 changes: 10 additions & 3 deletions croissant-rdf/tests/test_generate_rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from croissant_rdf import HuggingfaceHarvester


def test_convert_to_rdf_mock_data():
"""Test with mock data"""
data = [
Expand All @@ -30,7 +31,9 @@ def test_convert_to_rdf_mock_data():
"name": "test_dataset_3",
},
]
with tempfile.NamedTemporaryFile(mode="w+b", suffix=".ttl", delete_on_close=False) as fp:
with tempfile.NamedTemporaryFile(
mode="w+b", suffix=".ttl", delete_on_close=False
) as fp:

harvester = HuggingfaceHarvester(fname=fp.name)
file_ttl = harvester.convert_to_rdf(data)
Expand All @@ -45,7 +48,9 @@ def test_convert_to_rdf_mock_data():
def test_convert_to_rdf_mock_data_empty():
"""Test with empty data"""
data = []
with tempfile.NamedTemporaryFile(mode="w+b", suffix=".ttl", delete_on_close=False) as fp:
with tempfile.NamedTemporaryFile(
mode="w+b", suffix=".ttl", delete_on_close=False
) as fp:
harvester = HuggingfaceHarvester(fname=fp.name)
harvester.convert_to_rdf(data)
assert os.path.isfile(fp.name)
Expand All @@ -55,7 +60,9 @@ def test_convert_to_rdf_mock_data_empty():

def test_convert_to_rdf_real_data():
"""Test data from HuggingFace, does not require API key"""
with tempfile.NamedTemporaryFile(mode="w+b", suffix=".ttl", delete_on_close=False) as fp:
with tempfile.NamedTemporaryFile(
mode="w+b", suffix=".ttl", delete_on_close=False
) as fp:
harvester = HuggingfaceHarvester(fname=fp.name, limit=5)
data = harvester.fetch_datasets_croissant()
harvester.convert_to_rdf(data)
Expand Down
7 changes: 1 addition & 6 deletions eclair/src/eclair/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,4 @@
# e.g., from eclair.server import EclairServer
# e.g., from eclair.client import EclairClient

__all__ = [
"__version__",
"__author__",
"__email__",
"__description__"
]
__all__ = ["__version__", "__author__", "__email__", "__description__"]
2 changes: 1 addition & 1 deletion eclair/src/eclair/client/claude/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@
ClaudeMCPClient = None
CLAUDE_AVAILABLE = False

__all__ = ['ClaudeMCPClient', 'CLAUDE_AVAILABLE']
__all__ = ["ClaudeMCPClient", "CLAUDE_AVAILABLE"]
Loading
Loading