feat(datasets): support Google Sheets URLs in dataset loader

This commit is contained in:
ykd007
2026-05-14 15:02:24 +05:30
parent 5b90eb032a
commit b4a5a5dc5a
2 changed files with 70 additions and 3 deletions
+43 -2
View File
@@ -31,12 +31,53 @@ TransformFn = Callable[[str], str]
# Core data loading utilities
def fetch_csv_content(url: str) -> str:
    """Fetch CSV content from a URL.

    Handles Google Sheets share links by converting them to the CSV export URL.
    Accepts both the edit link format and the /pub?output=csv format.

    Args:
        url: Address of the CSV resource, or a Google Sheets link that
            ``_normalize_google_sheets_url`` knows how to rewrite.

    Returns:
        The response body decoded as UTF-8 text.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Normalize first so exactly one request is sent, and it targets the
    # CSV endpoint rather than the HTML page behind a share link.
    url = _normalize_google_sheets_url(url)
    # httpx does not follow redirects unless asked to.
    response = httpx.get(url, follow_redirects=True)
    response.raise_for_status()  # Raise exception for bad responses
    return response.content.decode("utf-8")
def _normalize_google_sheets_url(url: str) -> str:
"""Convert a Google Sheets share/edit URL to a CSV export URL if needed.
Supports the following formats:
- https://docs.google.com/spreadsheets/d/<ID>/edit#gid=<GID>
- https://docs.google.com/spreadsheets/d/<ID>/pub?output=csv (already correct)
- https://docs.google.com/spreadsheets/d/<ID>/export?format=csv (already correct)
Returns the URL unchanged for non-Google-Sheets links.
"""
import re
match = re.match(
r"https://docs\.google\.com/spreadsheets/d/([^/]+)(?:/[^?#]*)?(?:[?#].*)?$",
url,
)
if not match:
return url
sheet_id = match.group(1)
# Already a direct export link — leave it alone
if "export?format=csv" in url or "pub?output=csv" in url:
return url
# Extract optional gid (sheet tab) from fragment or query string
gid_match = re.search(r"gid=(\d+)", url)
gid_suffix = f"&gid={gid_match.group(1)}" if gid_match else ""
export_url = (
f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv{gid_suffix}"
)
logger.info(f"Converting Google Sheets URL to CSV export: {export_url}")
return export_url
def load_df_from_source(source: str, is_url: bool = False) -> pd.DataFrame:
"""Load DataFrame from either URL or Hugging Face dataset."""
if is_url:
+27 -1
View File
@@ -1,6 +1,32 @@
from inline_snapshot import snapshot
from .data import prepare_prompts
from .data import _normalize_google_sheets_url, prepare_prompts
class TestNormalizeGoogleSheetsUrl:
    """Unit tests for ``_normalize_google_sheets_url``."""

    def test_passthrough_non_sheets_url(self):
        # A link on an unrelated host must come back untouched.
        source = "https://raw.githubusercontent.com/example/repo/main/data.csv"
        assert _normalize_google_sheets_url(source) == source

    def test_edit_url_converted_to_export(self):
        # An edit link with a tab fragment keeps its id and gid after conversion.
        converted = _normalize_google_sheets_url(
            "https://docs.google.com/spreadsheets/d/ABC123/edit#gid=0"
        )
        for expected_piece in ("export?format=csv", "ABC123", "gid=0"):
            assert expected_piece in converted

    def test_edit_url_no_gid(self):
        # Without a gid the export URL carries no extra query parameter.
        expected = "https://docs.google.com/spreadsheets/d/ABC123/export?format=csv"
        actual = _normalize_google_sheets_url(
            "https://docs.google.com/spreadsheets/d/ABC123/edit"
        )
        assert actual == expected

    def test_already_export_url_unchanged(self):
        # Export links are already in the target form.
        source = "https://docs.google.com/spreadsheets/d/ABC123/export?format=csv"
        assert _normalize_google_sheets_url(source) == source

    def test_pub_csv_url_unchanged(self):
        # Published CSV links are already in a downloadable form.
        source = "https://docs.google.com/spreadsheets/d/ABC123/pub?output=csv"
        assert _normalize_google_sheets_url(source) == source
class TestPreparePrompts: