try less strict tar extract for 2025-10-15 and other days that fail

src/contributions/approve_submission.py handle big json files
update download.sh
2026-05-03 16:25:08 +02:00 · 2026-02-19 00:20:03 -05:00 · 2026-02-18 23:18:19 -05:00 · 2026-02-18 23:18:19 -05:00 · 2026-02-18 23:18:19 -05:00 · 2026-02-18 17:18:49 -05:00
4 changed files with 81 additions and 15 deletions
@@ -23,6 +23,12 @@ gh run list \
    "repos/$REPO/actions/runs/$run_id/artifacts" \
    --jq '.artifacts[] | select(.name | test("^openairframes_adsb-[0-9]{4}-[0-9]{2}-[0-9]{2}-[0-9]{4}-[0-9]{2}-[0-9]{2}$")) | .name' | while read -r artifact_name; do
    
+    # Check if artifact directory already exists and has files
+    if [ -d "downloads/adsb_artifacts/$artifact_name" ] && [ -n "$(ls -A "downloads/adsb_artifacts/$artifact_name" 2>/dev/null)" ]; then
+      echo "  Skipping (already exists): $artifact_name"
+      continue
+    fi
+    
    echo "  Downloading: $artifact_name"
    gh run download "$run_id" \
      --repo "$REPO" \
@@ -129,13 +129,32 @@ def fetch_releases(version_date: str) -> list:
    return releases


-def download_asset(asset_url: str, file_path: str) -> bool:
-    """Download a single release asset."""
+def download_asset(asset_url: str, file_path: str, expected_size: int | None = None) -> bool:
+    """Download a single release asset with size verification.
+    
+    Args:
+        asset_url: URL to download from
+        file_path: Local path to save to
+        expected_size: Expected file size in bytes (for verification)
+    
+    Returns:
+        True if download succeeded and size matches (if provided), False otherwise
+    """
    os.makedirs(os.path.dirname(file_path) or OUTPUT_DIR, exist_ok=True)
    
+    # Check if file exists and has correct size
    if os.path.exists(file_path):
-        print(f"[SKIP] {file_path} already downloaded.")
-        return True
+        if expected_size is not None:
+            actual_size = os.path.getsize(file_path)
+            if actual_size == expected_size:
+                print(f"[SKIP] {file_path} already downloaded and verified ({actual_size} bytes).")
+                return True
+            else:
+                print(f"[WARN] {file_path} exists but size mismatch (expected {expected_size}, got {actual_size}). Re-downloading.")
+                os.remove(file_path)
+        else:
+            print(f"[SKIP] {file_path} already downloaded.")
+            return True
    
    max_retries = 2
    retry_delay = 30
@@ -153,7 +172,21 @@ def download_asset(asset_url: str, file_path: str) -> bool:
                            if not chunk:
                                break
                            file.write(chunk)
-                    print(f"Saved {file_path}")
+                    
+                    # Verify file size if expected_size was provided
+                    if expected_size is not None:
+                        actual_size = os.path.getsize(file_path)
+                        if actual_size != expected_size:
+                            print(f"[ERROR] Size mismatch for {file_path}: expected {expected_size} bytes, got {actual_size} bytes")
+                            os.remove(file_path)
+                            if attempt < max_retries:
+                                print(f"Waiting {retry_delay} seconds before retry")
+                                time.sleep(retry_delay)
+                                continue
+                            return False
+                        print(f"Saved {file_path} ({actual_size} bytes, verified)")
+                    else:
+                        print(f"Saved {file_path}")
                    return True
                else:
                    print(f"Failed to download {asset_url}: {response.status} {response.msg}")
@@ -227,7 +260,6 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
            stdin=cat_proc.stdout,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
-            check=True
        )
        cat_proc.stdout.close()
        cat_stderr = cat_proc.stderr.read().decode() if cat_proc.stderr else ""
@@ -236,6 +268,24 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
        if cat_stderr:
            print(f"cat stderr: {cat_stderr}")
        
+        tar_stderr = result.stderr.decode() if result.stderr else ""
+        if result.returncode != 0:
+            # GNU tar exits non-zero for format issues that BSD tar silently
+            # tolerates (e.g. trailing junk after the last valid entry).
+            # Check whether files were actually extracted before giving up.
+            extracted_items = os.listdir(extract_dir)
+            if extracted_items:
+                print(f"[WARN] tar exited {result.returncode} but extracted "
+                      f"{len(extracted_items)} items — treating as success")
+                if tar_stderr:
+                    print(f"tar stderr: {tar_stderr}")
+            else:
+                print(f"Failed to extract split archive (tar exit {result.returncode})")
+                if tar_stderr:
+                    print(f"tar stderr: {tar_stderr}")
+                shutil.rmtree(extract_dir, ignore_errors=True)
+                return False
+        
        print(f"Successfully extracted archive to {extract_dir}")
        
        # Delete tar files immediately after extraction
@@ -252,11 +302,9 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
        print(f"Disk space after tar deletion: {free_gb:.1f}GB free")
        
        return True
-    except subprocess.CalledProcessError as e:
-        stderr_output = e.stderr.decode() if e.stderr else ""
+    except Exception as e:
        print(f"Failed to extract split archive: {e}")
-        if stderr_output:
-            print(f"tar stderr: {stderr_output}")
+        shutil.rmtree(extract_dir, ignore_errors=True)
        return False


@@ -77,8 +77,9 @@ def download_and_extract(version_date: str) -> str | None:
            for asset in use_assets:
                asset_name = asset["name"]
                asset_url = asset["browser_download_url"]
+                asset_size = asset.get("size")  # Get expected file size
                file_path = os.path.join(OUTPUT_DIR, asset_name)
-                if download_asset(asset_url, file_path):
+                if download_asset(asset_url, file_path, expected_size=asset_size):
                    downloaded_files.append(file_path)
    
    if not downloaded_files:
@@ -246,6 +246,20 @@ def process_submission(
    if schema_updated:
        schema_note = f"\n**Schema Updated:** Added new tags: `{', '.join(new_tags)}`\n"
    
+    # Truncate JSON preview to stay under GitHub's 65536 char body limit
+    max_json_preview = 50000
+    if len(content_json) > max_json_preview:
+        # Show first few entries as a preview
+        preview_entries = submissions[:10]
+        preview_json = json.dumps(preview_entries, indent=2, sort_keys=True)
+        json_section = (
+            f"### Submissions (showing 10 of {len(submissions)})\n"
+            f"```json\n{preview_json}\n```\n\n"
+            f"*Full submission ({len(submissions)} entries, {len(content_json):,} chars) is in the committed file.*"
+        )
+    else:
+        json_section = f"### Submissions\n```json\n{content_json}\n```"
+
    pr_body = f"""## Community Submission

 Adds {len(submissions)} submission(s) from @{author_username}.
@@ -257,10 +271,7 @@ Closes #{issue_number}

 ---

-### Submissions
-```json
-{content_json}
-```"""
+{json_section}"""
    
    pr = create_pull_request(
        title=f"Community submission: {filename}",
Author	SHA1	Message	Date
ggman12	82d11d8d24	try less strict tar extract for 2025-10-15 and other days that fail	2026-02-19 00:20:03 -05:00
ggman12	76a217ad14	src/contributions/approve_submission.py handle big json files	2026-02-18 23:18:19 -05:00
ggman12	ec2d1a1291	update download.sh	2026-02-18 23:18:19 -05:00
ggman12	97284c69a9	verify download assets	2026-02-18 23:18:19 -05:00
JG	892ffa78af	Merge pull request #28 from PlaneQuery/community-submission-27 Community submission: ggman12_2026-02-18_5ddbb8bd.json	2026-02-18 17:18:49 -05:00