Merge pull request #290 from ykd007/feat/google-sheets-dataset-support

feat(datasets): support Google Sheets URLs in dataset loader
2026-07-02 01:55:30 +02:00 · 2026-05-14 19:20:49 +03:00
parent 669169bd11 dc24d91250
commit e38365c904
2 changed files with 69 additions and 3 deletions
@@ -1,6 +1,7 @@
 import io
 import os
 import random
+import re
 from collections.abc import Callable, Iterator
 from functools import partial
 from typing import Any, TypeVar
@@ -31,12 +32,49 @@ TransformFn = Callable[[str], str]

 # Core data loading utilities
 def fetch_csv_content(url: str) -> str:
-    """Fetch CSV content from a URL."""
-    response = httpx.get(url)
+    """Fetch CSV content from a URL.
+
+    Handles Google Sheets share links by converting them to the CSV export URL.
+    Accepts both the edit link format and the /pub?output=csv format.
+    """
+    url = _normalize_google_sheets_url(url)
+    response = httpx.get(url, follow_redirects=True)
    response.raise_for_status()  # Raise exception for bad responses
    return response.content.decode("utf-8")


+def _normalize_google_sheets_url(url: str) -> str:
+    """Convert a Google Sheets share/edit URL to a CSV export URL if needed.
+
+    Supports the following formats:
+    - https://docs.google.com/spreadsheets/d/<ID>/edit#gid=<GID>
+    - https://docs.google.com/spreadsheets/d/<ID>/pub?output=csv  (already correct)
+    - https://docs.google.com/spreadsheets/d/<ID>/export?format=csv  (already correct)
+
+    Returns the URL unchanged for non-Google-Sheets links.
+    """
+    match = re.match(
+        r"https://docs\.google\.com/spreadsheets/d/([^/]+)(?:/[^?#]*)?(?:[?#].*)?$",
+        url,
+    )
+    if not match:
+        return url
+
+    sheet_id = match.group(1)
+
+    # Already a direct export link — leave it alone
+    if "export?format=csv" in url or "pub?output=csv" in url:
+        return url
+
+    # Extract optional gid (sheet tab) from fragment or query string
+    gid_match = re.search(r"gid=(\d+)", url)
+    gid_suffix = f"&gid={gid_match.group(1)}" if gid_match else ""
+
+    export_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv{gid_suffix}"
+    logger.info(f"Converting Google Sheets URL to CSV export: {export_url}")
+    return export_url
+
+
 def load_df_from_source(source: str, is_url: bool = False) -> pd.DataFrame:
    """Load DataFrame from either URL or Hugging Face dataset."""
    if is_url:
@@ -1,6 +1,34 @@
 from inline_snapshot import snapshot

-from .data import prepare_prompts
+from .data import _normalize_google_sheets_url, prepare_prompts
+
+
+class TestNormalizeGoogleSheetsUrl:
+    def test_passthrough_non_sheets_url(self):
+        url = "https://raw.githubusercontent.com/example/repo/main/data.csv"
+        assert _normalize_google_sheets_url(url) == url
+
+    def test_edit_url_converted_to_export(self):
+        url = "https://docs.google.com/spreadsheets/d/ABC123/edit#gid=0"
+        result = _normalize_google_sheets_url(url)
+        assert "export?format=csv" in result
+        assert "ABC123" in result
+        assert "gid=0" in result
+
+    def test_edit_url_no_gid(self):
+        url = "https://docs.google.com/spreadsheets/d/ABC123/edit"
+        result = _normalize_google_sheets_url(url)
+        assert (
+            result == "https://docs.google.com/spreadsheets/d/ABC123/export?format=csv"
+        )
+
+    def test_already_export_url_unchanged(self):
+        url = "https://docs.google.com/spreadsheets/d/ABC123/export?format=csv"
+        assert _normalize_google_sheets_url(url) == url
+
+    def test_pub_csv_url_unchanged(self):
+        url = "https://docs.google.com/spreadsheets/d/ABC123/pub?output=csv"
+        assert _normalize_google_sheets_url(url) == url


 class TestPreparePrompts: