Compare commits

...

10 Commits

Author SHA1 Message Date
github-actions[bot] 1db92d2ffd Add community submission from @ggman12 (closes #7) 2026-02-12 17:28:54 +00:00
ggman12 61aae586ee fix approve 2026-02-12 12:18:28 -05:00
ggman12 5abfa6b226 update submission validation 2026-02-12 12:15:04 -05:00
ggman12 a743b74ae5 Merge branch 'develop' 2026-02-12 12:10:24 -05:00
ggman12 53a020ab73 add jsonschema to requirements.txt 2026-02-12 12:09:03 -05:00
ggman12 2de41c9883 update historical. To check tar and fail fast if any maps fail 2026-02-12 12:01:13 -05:00
ggman12 bccc634158 remove existing release 2026-02-12 11:50:45 -05:00
ggman12 43b07942b0 add needed permissions 2026-02-12 11:42:49 -05:00
ggman12 2c9e994a12 add debug for FAA 2026-02-12 11:06:38 -05:00
ggman12 99b680476a delete parquet chunck after load to not use so much space for big historical run 2026-02-12 10:52:42 -05:00
9 changed files with 132 additions and 18 deletions
@@ -38,9 +38,10 @@ jobs:
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_REPOSITORY: ${{ github.repository }} GITHUB_REPOSITORY: ${{ github.repository }}
ISSUE_BODY: ${{ github.event.issue.body }}
run: | run: |
python -m src.contributions.approve_submission \ python -m src.contributions.approve_submission \
--issue-number ${{ github.event.issue.number }} \ --issue-number ${{ github.event.issue.number }} \
--issue-body "${{ github.event.issue.body }}" \ --issue-body "$ISSUE_BODY" \
--author "${{ steps.author.outputs.username }}" \ --author "${{ steps.author.outputs.username }}" \
--author-id ${{ steps.author.outputs.user_id }} --author-id ${{ steps.author.outputs.user_id }}
+29 -8
View File
@@ -81,8 +81,22 @@ jobs:
- name: Create tar of extracted data - name: Create tar of extracted data
run: | run: |
cd data/output cd data/output
tar -cf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "Some files may not exist" echo "=== Disk space before tar ==="
ls -lah extracted_data.tar || echo "No tar created" df -h .
echo "=== Files to tar ==="
ls -lah *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "No files found"
# Create tar with explicit error checking
if ls *-planes-readsb-prod-0.tar_0 1>/dev/null 2>&1; then
tar -cvf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt
echo "=== Tar file created ==="
ls -lah extracted_data.tar
# Verify tar integrity
tar -tf extracted_data.tar > /dev/null && echo "Tar integrity check passed" || { echo "Tar integrity check FAILED"; exit 1; }
else
echo "ERROR: No extracted directories found, cannot create tar"
exit 1
fi
- name: Upload extracted data - name: Upload extracted data
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
@@ -97,7 +111,7 @@ jobs:
needs: [generate-matrix, adsb-extract] needs: [generate-matrix, adsb-extract]
runs-on: ubuntu-24.04-arm runs-on: ubuntu-24.04-arm
strategy: strategy:
fail-fast: false fail-fast: true
matrix: matrix:
chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }} chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
icao_chunk: [0, 1, 2, 3] icao_chunk: [0, 1, 2, 3]
@@ -134,7 +148,12 @@ jobs:
run: | run: |
cd data/output cd data/output
if [ -f extracted_data.tar ]; then if [ -f extracted_data.tar ]; then
tar -xf extracted_data.tar echo "=== Tar file info ==="
ls -lah extracted_data.tar
echo "=== Verifying tar integrity ==="
tar -tf extracted_data.tar > /dev/null || { echo "ERROR: Tar file is corrupted"; exit 1; }
echo "=== Extracting ==="
tar -xvf extracted_data.tar
rm extracted_data.tar rm extracted_data.tar
echo "has_data=true" >> "$GITHUB_OUTPUT" echo "has_data=true" >> "$GITHUB_OUTPUT"
echo "=== Contents of data/output ===" echo "=== Contents of data/output ==="
@@ -188,17 +207,19 @@ jobs:
- name: Debug downloaded files - name: Debug downloaded files
run: | run: |
echo "=== Disk space before processing ==="
df -h
echo "=== Listing data/output/adsb_chunks/ ===" echo "=== Listing data/output/adsb_chunks/ ==="
find data/output/adsb_chunks/ -type f 2>/dev/null | head -50 || echo "No files found" find data/output/adsb_chunks/ -type f 2>/dev/null | wc -l
echo "=== Looking for parquet files ===" echo "=== Total parquet size ==="
find . -name "*.parquet" 2>/dev/null | head -20 || echo "No parquet files found" du -sh data/output/adsb_chunks/ || echo "No chunks dir"
- name: Combine chunks to CSV - name: Combine chunks to CSV
env: env:
START_DATE: ${{ needs.generate-matrix.outputs.global_start }} START_DATE: ${{ needs.generate-matrix.outputs.global_start }}
END_DATE: ${{ needs.generate-matrix.outputs.global_end }} END_DATE: ${{ needs.generate-matrix.outputs.global_end }}
run: | run: |
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base --stream
ls -lah data/planequery_aircraft/ ls -lah data/planequery_aircraft/
- name: Upload final artifact - name: Upload final artifact
@@ -277,6 +277,15 @@ jobs:
name: community-release name: community-release
path: artifacts/community path: artifacts/community
# Diagnostic step: prints the files found under each artifact directory
# (FAA, ADS-B, community) so the layout is visible in the job log.
- name: Debug artifact structure
run: |
# Each `find` lists regular files only (-type f) and hides its own error
# output; the fallback message is printed only when `find` exits non-zero
# (for example when the directory is missing). A directory that exists but
# is empty prints nothing at all.
echo "=== FAA artifacts ==="
find artifacts/faa -type f 2>/dev/null || echo "No files found in artifacts/faa"
echo "=== ADS-B artifacts ==="
find artifacts/adsb -type f 2>/dev/null || echo "No files found in artifacts/adsb"
echo "=== Community artifacts ==="
find artifacts/community -type f 2>/dev/null || echo "No files found in artifacts/community"
- name: Prepare release metadata - name: Prepare release metadata
id: meta id: meta
run: | run: |
@@ -312,6 +321,13 @@ jobs:
echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT" echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT"
echo "name=planequery-aircraft snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT" echo "name=planequery-aircraft snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT"
# Removes a previously published release and its tag (if any) so the release
# step that follows can recreate them for the same tag name.
- name: Delete existing release if exists
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    # Hand the tag to the shell as data through the environment instead of
    # splicing the expression into the script text, so its value can never be
    # interpreted as shell syntax.
    RELEASE_TAG: ${{ steps.meta.outputs.tag }}
  run: |
    # Both commands are deliberately best-effort: a missing release or tag is
    # the normal case on a first run, so failures are silenced and ignored.
    gh release delete "$RELEASE_TAG" --yes 2>/dev/null || true
    git push --delete origin "refs/tags/$RELEASE_TAG" 2>/dev/null || true
- name: Create GitHub Release and upload assets - name: Create GitHub Release and upload assets
uses: softprops/action-gh-release@v2 uses: softprops/action-gh-release@v2
with: with:
@@ -4,6 +4,9 @@ on:
issues: issues:
types: [opened, edited] types: [opened, edited]
permissions:
issues: write
jobs: jobs:
validate: validate:
if: contains(github.event.issue.labels.*.name, 'submission') if: contains(github.event.issue.labels.*.name, 'submission')
@@ -20,11 +23,24 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: pip install jsonschema run: pip install jsonschema
# Diagnostic step: echoes the raw issue body into the job log.
- name: Debug issue body
  env:
    # The issue body is attacker-controlled. Expanding it directly inside the
    # script (even within a quoted heredoc) lets a body line that equals the
    # heredoc terminator end the heredoc, after which the rest of the body is
    # executed as shell. Passing it through the environment keeps it as data.
    ISSUE_BODY: ${{ github.event.issue.body }}
  run: |
    echo "=== Issue Body ==="
    # printf with a fixed format prints the body verbatim plus one newline,
    # matching what the heredoc produced for well-behaved input.
    printf '%s\n' "$ISSUE_BODY"
# Writes the issue body to /tmp/issue_body.txt for later steps to read from
# disk rather than from a command-line argument.
- name: Save issue body to file
  env:
    # The issue body is attacker-controlled. Expanding it inside a heredoc in
    # the script allows a body line equal to the terminator to close the
    # heredoc early, which both truncates the saved file and runs the rest of
    # the body as shell commands. The environment variable is treated purely
    # as data.
    ISSUE_BODY: ${{ github.event.issue.body }}
  run: |
    # Fixed format string: the body is written verbatim followed by a newline.
    printf '%s\n' "$ISSUE_BODY" > /tmp/issue_body.txt
- name: Validate submission - name: Validate submission
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_REPOSITORY: ${{ github.repository }} GITHUB_REPOSITORY: ${{ github.repository }}
run: | run: |
python -m src.contributions.validate_submission \ python -m src.contributions.validate_submission \
--issue-body "${{ github.event.issue.body }}" \ --issue-body-file /tmp/issue_body.txt \
--issue-number ${{ github.event.issue.number }} --issue-number ${{ github.event.issue.number }}
@@ -0,0 +1,11 @@
[
{
"contributor_name": "@ggman12",
"contributor_uuid": "2981c3ee-8712-5f96-84bf-732eda515a3f",
"creation_timestamp": "2026-02-12T17:28:52.823922+00:00",
"registration_number": "N12345",
"tags": {
"internet": "starlink"
}
}
]
+1
View File
@@ -3,3 +3,4 @@ pandas==3.0.0
pyarrow==23.0.0 pyarrow==23.0.0
orjson==3.11.7 orjson==3.11.7
polars==1.38.1 polars==1.38.1
jsonschema==4.26.0
+18 -3
View File
@@ -36,8 +36,13 @@ def get_target_day() -> datetime:
return datetime.utcnow() - timedelta(days=1) return datetime.utcnow() - timedelta(days=1)
def process_single_chunk(chunk_path: str) -> pl.DataFrame: def process_single_chunk(chunk_path: str, delete_after_load: bool = False) -> pl.DataFrame:
"""Load and compress a single chunk parquet file.""" """Load and compress a single chunk parquet file.
Args:
chunk_path: Path to parquet file
delete_after_load: If True, delete the parquet file after loading to free disk space
"""
print(f"Processing {os.path.basename(chunk_path)}... | {get_resource_usage()}") print(f"Processing {os.path.basename(chunk_path)}... | {get_resource_usage()}")
# Load chunk - only columns we need # Load chunk - only columns we need
@@ -45,6 +50,14 @@ def process_single_chunk(chunk_path: str) -> pl.DataFrame:
df = pl.read_parquet(chunk_path, columns=needed_columns) df = pl.read_parquet(chunk_path, columns=needed_columns)
print(f" Loaded {len(df)} rows") print(f" Loaded {len(df)} rows")
# Delete file immediately after loading to free disk space
if delete_after_load:
try:
os.remove(chunk_path)
print(f" Deleted {chunk_path} to free disk space")
except Exception as e:
print(f" Warning: Failed to delete {chunk_path}: {e}")
# Compress to aircraft records (one per ICAO) using shared function # Compress to aircraft records (one per ICAO) using shared function
compressed = compress_multi_icao_df(df, verbose=True) compressed = compress_multi_icao_df(df, verbose=True)
print(f" Compressed to {len(compressed)} aircraft records") print(f" Compressed to {len(compressed)} aircraft records")
@@ -156,6 +169,7 @@ def main():
parser.add_argument("--chunks-dir", type=str, default=DEFAULT_CHUNK_DIR, help="Directory containing chunk parquet files") parser.add_argument("--chunks-dir", type=str, default=DEFAULT_CHUNK_DIR, help="Directory containing chunk parquet files")
parser.add_argument("--skip-base", action="store_true", help="Skip downloading and merging base release") parser.add_argument("--skip-base", action="store_true", help="Skip downloading and merging base release")
parser.add_argument("--keep-chunks", action="store_true", help="Keep chunk files after merging") parser.add_argument("--keep-chunks", action="store_true", help="Keep chunk files after merging")
parser.add_argument("--stream", action="store_true", help="Delete parquet files immediately after loading to save disk space")
args = parser.parse_args() args = parser.parse_args()
# Determine output ID and filename based on mode # Determine output ID and filename based on mode
@@ -190,9 +204,10 @@ def main():
print(f"Found {len(chunk_files)} chunk files") print(f"Found {len(chunk_files)} chunk files")
# Process each chunk separately to save memory # Process each chunk separately to save memory
# With --stream, delete parquet files immediately after loading to save disk space
compressed_chunks = [] compressed_chunks = []
for chunk_path in chunk_files: for chunk_path in chunk_files:
compressed = process_single_chunk(chunk_path) compressed = process_single_chunk(chunk_path, delete_after_load=args.stream)
compressed_chunks.append(compressed) compressed_chunks.append(compressed)
gc.collect() gc.collect()
+22 -5
View File
@@ -54,7 +54,9 @@ def extract_json_from_issue_body(body: str) -> str | None:
""" """
Extract JSON from GitHub issue body. Extract JSON from GitHub issue body.
Looks for JSON in the 'Submission JSON' section wrapped in code blocks. Looks for JSON in the 'Submission JSON' section, either:
- Wrapped in code blocks (```json ... ``` or ``` ... ```)
- Or raw JSON after the header
Args: Args:
body: The issue body text body: The issue body text
@@ -62,13 +64,28 @@ def extract_json_from_issue_body(body: str) -> str | None:
Returns: Returns:
Extracted JSON string or None if not found Extracted JSON string or None if not found
""" """
# Match JSON in "### Submission JSON" section # Try: JSON in code blocks after "### Submission JSON"
pattern = r"### Submission JSON\s*\n\s*```(?:json)?\s*\n([\s\S]*?)\n\s*```" pattern_codeblock = r"### Submission JSON\s*\n\s*```(?:json)?\s*\n([\s\S]*?)\n\s*```"
match = re.search(pattern, body) match = re.search(pattern_codeblock, body)
if match: if match:
return match.group(1).strip() return match.group(1).strip()
# Try: Raw JSON after "### Submission JSON" until next section or end
pattern_raw = r"### Submission JSON\s*\n\s*([\[{][\s\S]*?[\]}])(?=\n###|\n\n###|$)"
match = re.search(pattern_raw, body)
if match:
return match.group(1).strip()
# Try: Any JSON object/array in the body (fallback)
pattern_any = r"([\[{][\s\S]*?[\]}])"
for match in re.finditer(pattern_any, body):
candidate = match.group(1).strip()
# Validate it looks like JSON
if candidate.startswith('{') and candidate.endswith('}'):
return candidate
if candidate.startswith('[') and candidate.endswith(']'):
return candidate
return None return None
+16
View File
@@ -7,6 +7,7 @@ submissions when issues are opened or edited.
Usage: Usage:
python -m src.contributions.validate_submission --issue-body "..." python -m src.contributions.validate_submission --issue-body "..."
python -m src.contributions.validate_submission --issue-body-file /path/to/body.txt
python -m src.contributions.validate_submission --file submission.json python -m src.contributions.validate_submission --file submission.json
echo '{"registration_number": "N12345"}' | python -m src.contributions.validate_submission --stdin echo '{"registration_number": "N12345"}' | python -m src.contributions.validate_submission --stdin
@@ -106,6 +107,7 @@ def main():
parser = argparse.ArgumentParser(description="Validate community submission JSON") parser = argparse.ArgumentParser(description="Validate community submission JSON")
source_group = parser.add_mutually_exclusive_group(required=True) source_group = parser.add_mutually_exclusive_group(required=True)
source_group.add_argument("--issue-body", help="Issue body text containing JSON") source_group.add_argument("--issue-body", help="Issue body text containing JSON")
source_group.add_argument("--issue-body-file", help="File containing issue body text")
source_group.add_argument("--file", help="JSON file to validate") source_group.add_argument("--file", help="JSON file to validate")
source_group.add_argument("--stdin", action="store_true", help="Read JSON from stdin") source_group.add_argument("--stdin", action="store_true", help="Read JSON from stdin")
@@ -125,6 +127,20 @@ def main():
"Please ensure your JSON is in the 'Submission JSON' field wrapped in code blocks." "Please ensure your JSON is in the 'Submission JSON' field wrapped in code blocks."
) )
sys.exit(1) sys.exit(1)
elif args.issue_body_file:
with open(args.issue_body_file) as f:
issue_body = f.read()
json_str = extract_json_from_issue_body(issue_body)
if not json_str:
print("❌ Could not extract JSON from issue body", file=sys.stderr)
print(f"Issue body:\n{issue_body}", file=sys.stderr)
if args.issue_number:
add_issue_comment(
args.issue_number,
"❌ **Validation Failed**\n\nCould not extract JSON from submission. "
"Please ensure your JSON is in the 'Submission JSON' field."
)
sys.exit(1)
elif args.file: elif args.file:
with open(args.file) as f: with open(args.file) as f:
json_str = f.read() json_str = f.read()