mirror of
https://github.com/msoedov/agentic_security.git
synced 2026-06-23 21:59:57 +02:00
feat(datasets): support Google Sheets URLs in dataset loader
This commit is contained in:
@@ -31,12 +31,53 @@ TransformFn = Callable[[str], str]
|
||||
|
||||
# Core data loading utilities
|
||||
def fetch_csv_content(url: str) -> str:
    """Fetch CSV content from a URL and return it as text.

    Handles Google Sheets share links by converting them to the CSV export
    URL via ``_normalize_google_sheets_url``; any other URL is requested
    exactly as given.

    Args:
        url: Location of the CSV resource, or a Google Sheets link.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Normalize first so that exactly one request is issued, and it targets
    # the CSV export endpoint rather than the HTML share page.
    url = _normalize_google_sheets_url(url)
    # NOTE(review): redirects are followed presumably because the Sheets
    # export endpoint answers with a redirect — confirm against a live link.
    response = httpx.get(url, follow_redirects=True)
    response.raise_for_status()  # Raise exception for bad responses
    return response.content.decode("utf-8")
|
||||
|
||||
|
||||
def _normalize_google_sheets_url(url: str) -> str:
|
||||
"""Convert a Google Sheets share/edit URL to a CSV export URL if needed.
|
||||
|
||||
Supports the following formats:
|
||||
- https://docs.google.com/spreadsheets/d/<ID>/edit#gid=<GID>
|
||||
- https://docs.google.com/spreadsheets/d/<ID>/pub?output=csv (already correct)
|
||||
- https://docs.google.com/spreadsheets/d/<ID>/export?format=csv (already correct)
|
||||
|
||||
Returns the URL unchanged for non-Google-Sheets links.
|
||||
"""
|
||||
import re
|
||||
|
||||
match = re.match(
|
||||
r"https://docs\.google\.com/spreadsheets/d/([^/]+)(?:/[^?#]*)?(?:[?#].*)?$",
|
||||
url,
|
||||
)
|
||||
if not match:
|
||||
return url
|
||||
|
||||
sheet_id = match.group(1)
|
||||
|
||||
# Already a direct export link — leave it alone
|
||||
if "export?format=csv" in url or "pub?output=csv" in url:
|
||||
return url
|
||||
|
||||
# Extract optional gid (sheet tab) from fragment or query string
|
||||
gid_match = re.search(r"gid=(\d+)", url)
|
||||
gid_suffix = f"&gid={gid_match.group(1)}" if gid_match else ""
|
||||
|
||||
export_url = (
|
||||
f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv{gid_suffix}"
|
||||
)
|
||||
logger.info(f"Converting Google Sheets URL to CSV export: {export_url}")
|
||||
return export_url
|
||||
|
||||
|
||||
def load_df_from_source(source: str, is_url: bool = False) -> pd.DataFrame:
|
||||
"""Load DataFrame from either URL or Hugging Face dataset."""
|
||||
if is_url:
|
||||
|
||||
@@ -1,6 +1,32 @@
|
||||
from inline_snapshot import snapshot
|
||||
|
||||
from .data import prepare_prompts
|
||||
from .data import _normalize_google_sheets_url, prepare_prompts
|
||||
|
||||
|
||||
class TestNormalizeGoogleSheetsUrl:
    """Behavioral checks for ``_normalize_google_sheets_url``."""

    def test_passthrough_non_sheets_url(self):
        # A link that is not a Google Sheets URL must come back untouched.
        source = "https://raw.githubusercontent.com/example/repo/main/data.csv"
        normalized = _normalize_google_sheets_url(source)
        assert normalized == source

    def test_edit_url_converted_to_export(self):
        # An edit link with a tab fragment keeps its ID and gid in the export URL.
        result = _normalize_google_sheets_url(
            "https://docs.google.com/spreadsheets/d/ABC123/edit#gid=0"
        )
        for expected_part in ("export?format=csv", "ABC123", "gid=0"):
            assert expected_part in result

    def test_edit_url_no_gid(self):
        # Without a gid the export URL carries no tab selector at all.
        expected = "https://docs.google.com/spreadsheets/d/ABC123/export?format=csv"
        actual = _normalize_google_sheets_url(
            "https://docs.google.com/spreadsheets/d/ABC123/edit"
        )
        assert actual == expected

    def test_already_export_url_unchanged(self):
        # Direct CSV export links are returned as-is.
        source = "https://docs.google.com/spreadsheets/d/ABC123/export?format=csv"
        normalized = _normalize_google_sheets_url(source)
        assert normalized == source

    def test_pub_csv_url_unchanged(self):
        # Published CSV links are returned as-is.
        source = "https://docs.google.com/spreadsheets/d/ABC123/pub?output=csv"
        normalized = _normalize_google_sheets_url(source)
        assert normalized == source
|
||||
|
||||
|
||||
class TestPreparePrompts:
|
||||
|
||||
Reference in New Issue
Block a user