# --- agentic_security/probe_data/data.py --------------------------------------


def fetch_csv_content(url: str) -> str:
    """Fetch CSV content from a URL and return it as text.

    Google Sheets share/edit links are rewritten to their CSV export URL by
    ``_normalize_google_sheets_url`` before the request is made; every other
    URL is requested exactly as given.

    Args:
        url: Location of the CSV resource, or a Google Sheets share link.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status
            (raised by ``response.raise_for_status()``).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # NOTE(review): redirects are followed presumably because the Sheets export
    # endpoint answers with a redirect to the actual download host - confirm.
    response = httpx.get(_normalize_google_sheets_url(url), follow_redirects=True)
    response.raise_for_status()  # Raise exception for bad responses
    return response.content.decode("utf-8")


def _normalize_google_sheets_url(url: str) -> str:
    """Convert a Google Sheets share/edit URL to a CSV export URL if needed.

    Supported inputs::

        https://docs.google.com/spreadsheets/d/<SHEET_ID>/edit#gid=<GID>
        https://docs.google.com/spreadsheets/d/<SHEET_ID>?usp=sharing
        https://docs.google.com/spreadsheets/d/<SHEET_ID>/pub?output=csv     (kept)
        https://docs.google.com/spreadsheets/d/<SHEET_ID>/export?format=csv  (kept)
        https://docs.google.com/spreadsheets/d/e/<TOKEN>/pub?...             (kept)

    Args:
        url: Any URL. Only ``https://docs.google.com/spreadsheets/d/...``
            links are candidates for rewriting.

    Returns:
        ``https://docs.google.com/spreadsheets/d/<SHEET_ID>/export?format=csv``
        (plus ``&gid=<GID>`` when the input names a tab) for share/edit links;
        the input unchanged for non-Sheets URLs, published ``/d/e/`` links and
        links that already point at a CSV download.
    """
    # Function-scope imports: the module's top-level import block is not
    # visible from here, so the block brings its own stdlib names.
    import re
    from urllib.parse import parse_qs, urlsplit

    # The id group stops at "/", "?" and "#" so that a link with no path
    # segment after the id (".../d/<id>?usp=sharing") does not drag the query
    # string or fragment into the id.
    match = re.match(
        r"https://docs\.google\.com/spreadsheets/d/([^/?#]+)(?:/[^?#]*)?(?:[?#].*)?$",
        url,
    )
    if not match:
        return url

    sheet_id = match.group(1)

    # "/d/e/<token>/..." is the "Publish to the web" link shape: the segment
    # after "/d/" is the literal "e", not a spreadsheet id, so no export URL
    # can be derived from it. Leave such links exactly as they are.
    if sheet_id == "e":
        return url

    # Already a direct CSV link - leave it alone. Inspect the endpoint and the
    # parsed query instead of searching for a substring so that parameter order
    # ("pub?gid=0&single=true&output=csv") does not matter.
    parts = urlsplit(url)
    endpoint = parts.path.rstrip("/").rsplit("/", 1)[-1]
    query = parse_qs(parts.query)
    is_csv_export = endpoint == "export" and "csv" in query.get("format", ())
    is_csv_publish = endpoint == "pub" and "csv" in query.get("output", ())
    if is_csv_export or is_csv_publish:
        return url

    # Extract optional gid (sheet tab) from the query string or the fragment.
    # The leading delimiter keeps names that merely end in "gid" from matching.
    gid_match = re.search(r"[?&#]gid=(\d+)", url)
    gid_suffix = f"&gid={gid_match.group(1)}" if gid_match else ""

    export_url = (
        f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv{gid_suffix}"
    )
    logger.info(f"Converting Google Sheets URL to CSV export: {export_url}")
    return export_url


# --- agentic_security/probe_data/test_data.py ---------------------------------


class TestNormalizeGoogleSheetsUrl:
    """Unit tests for ``_normalize_google_sheets_url``."""

    BASE = "https://docs.google.com/spreadsheets/d/ABC123"

    def test_passthrough_non_sheets_url(self):
        url = "https://raw.githubusercontent.com/example/repo/main/data.csv"
        assert _normalize_google_sheets_url(url) == url

    def test_edit_url_converted_to_export(self):
        result = _normalize_google_sheets_url(f"{self.BASE}/edit#gid=0")
        assert result == f"{self.BASE}/export?format=csv&gid=0"

    def test_edit_url_no_gid(self):
        result = _normalize_google_sheets_url(f"{self.BASE}/edit")
        assert result == f"{self.BASE}/export?format=csv"

    def test_gid_in_query_string(self):
        result = _normalize_google_sheets_url(f"{self.BASE}/edit?usp=sharing&gid=42")
        assert result == f"{self.BASE}/export?format=csv&gid=42"

    def test_gid_requires_exact_parameter_name(self):
        result = _normalize_google_sheets_url(f"{self.BASE}/edit?foogid=9")
        assert result == f"{self.BASE}/export?format=csv"

    def test_bare_id_followed_by_query_or_fragment(self):
        # The id must not absorb the query string or the fragment.
        assert (
            _normalize_google_sheets_url(f"{self.BASE}?usp=sharing")
            == f"{self.BASE}/export?format=csv"
        )
        assert (
            _normalize_google_sheets_url(f"{self.BASE}#gid=7")
            == f"{self.BASE}/export?format=csv&gid=7"
        )

    def test_already_export_url_unchanged(self):
        url = f"{self.BASE}/export?format=csv"
        assert _normalize_google_sheets_url(url) == url

    def test_pub_csv_url_unchanged(self):
        url = f"{self.BASE}/pub?output=csv"
        assert _normalize_google_sheets_url(url) == url

    def test_pub_csv_url_with_extra_params_unchanged(self):
        url = f"{self.BASE}/pub?gid=0&single=true&output=csv"
        assert _normalize_google_sheets_url(url) == url

    def test_published_token_url_unchanged(self):
        url = (
            "https://docs.google.com/spreadsheets/d/e/2PACX-1vTOKEN"
            "/pub?gid=0&single=true&output=csv"
        )
        assert _normalize_google_sheets_url(url) == url