FEATURE: Add contributions framework. Fix and improve the daily ADS-B release using GitHub Actions for map-reduce.

ggman12
2026-02-11 14:04:27 -05:00
parent 27da93801e
commit 722bcdf791
29 changed files with 2347 additions and 343 deletions
src/contributions/__init__.py (+1)
@@ -0,0 +1 @@
"""Community contributions processing module."""
src/contributions/approve_submission.py (+249)
@@ -0,0 +1,249 @@
#!/usr/bin/env python3
"""
Approve a community submission and create a PR.
This script is called by the GitHub Actions workflow when the 'approved'
label is added to a validated submission issue.
Usage:
python -m src.contributions.approve_submission --issue-number 123 --issue-body "..." --author "username" --author-id 12345
Environment variables:
GITHUB_TOKEN: GitHub API token with repo write permissions
GITHUB_REPOSITORY: owner/repo
"""
import argparse
import base64
import json
import os
import sys
import urllib.request
import urllib.error
from datetime import datetime, timezone
from .schema import extract_json_from_issue_body, extract_contributor_name_from_issue_body, parse_and_validate
from .contributor import (
generate_contributor_uuid,
generate_submission_filename,
compute_content_hash,
)
def github_api_request(
method: str,
endpoint: str,
data: dict | None = None,
accept: str = "application/vnd.github.v3+json"
) -> dict:
"""Make a GitHub API request."""
token = os.environ.get("GITHUB_TOKEN")
repo = os.environ.get("GITHUB_REPOSITORY")
if not token or not repo:
raise EnvironmentError("GITHUB_TOKEN and GITHUB_REPOSITORY must be set")
url = f"https://api.github.com/repos/{repo}{endpoint}"
headers = {
"Authorization": f"token {token}",
"Accept": accept,
"Content-Type": "application/json",
}
body = json.dumps(data).encode() if data else None
req = urllib.request.Request(url, data=body, headers=headers, method=method)
try:
with urllib.request.urlopen(req) as response:
return json.loads(response.read())
except urllib.error.HTTPError as e:
error_body = e.read().decode() if e.fp else ""
print(f"GitHub API error: {e.code} {e.reason}: {error_body}", file=sys.stderr)
raise
def add_issue_comment(issue_number: int, body: str) -> None:
"""Add a comment to a GitHub issue."""
github_api_request("POST", f"/issues/{issue_number}/comments", {"body": body})
def get_default_branch_sha() -> str:
"""Get the SHA of the default branch (main)."""
ref = github_api_request("GET", "/git/ref/heads/main")
return ref["object"]["sha"]
def create_branch(branch_name: str, sha: str) -> None:
"""Create a new branch from a SHA."""
try:
github_api_request("POST", "/git/refs", {
"ref": f"refs/heads/{branch_name}",
"sha": sha,
})
except urllib.error.HTTPError as e:
if e.code == 422: # Branch exists
# Delete and recreate
try:
github_api_request("DELETE", f"/git/refs/heads/{branch_name}")
except urllib.error.HTTPError:
pass
github_api_request("POST", "/git/refs", {
"ref": f"refs/heads/{branch_name}",
"sha": sha,
})
else:
raise
def create_or_update_file(path: str, content: str, message: str, branch: str) -> None:
"""Create or update a file in the repository."""
content_b64 = base64.b64encode(content.encode()).decode()
github_api_request("PUT", f"/contents/{path}", {
"message": message,
"content": content_b64,
"branch": branch,
})
def create_pull_request(title: str, head: str, base: str, body: str) -> dict:
"""Create a pull request."""
return github_api_request("POST", "/pulls", {
"title": title,
"head": head,
"base": base,
"body": body,
})
def add_labels_to_issue(issue_number: int, labels: list[str]) -> None:
"""Add labels to an issue or PR."""
github_api_request("POST", f"/issues/{issue_number}/labels", {"labels": labels})
def process_submission(
issue_number: int,
issue_body: str,
author_username: str,
author_id: int,
) -> bool:
"""
Process an approved submission and create a PR.
Args:
issue_number: The GitHub issue number
issue_body: The issue body text
author_username: The GitHub username of the issue author
author_id: The numeric GitHub user ID
Returns:
True if successful, False otherwise
"""
# Extract and validate JSON
json_str = extract_json_from_issue_body(issue_body)
if not json_str:
add_issue_comment(issue_number, "❌ Could not extract JSON from submission.")
return False
data, errors = parse_and_validate(json_str)
if errors:
error_list = "\n".join(f"- {e}" for e in errors)
add_issue_comment(issue_number, f"❌ **Validation Failed**\n\n{error_list}")
return False
# Normalize to list
submissions = data if isinstance(data, list) else [data]
# Generate contributor UUID from GitHub ID
contributor_uuid = generate_contributor_uuid(author_id)
# Extract contributor name from issue form (or default to GitHub username)
contributor_name = extract_contributor_name_from_issue_body(issue_body)
if not contributor_name:
contributor_name = f"@{author_username}"
# Add metadata to each submission
now = datetime.now(timezone.utc)
date_str = now.strftime("%Y-%m-%d")
timestamp_str = now.isoformat()
for submission in submissions:
submission["contributor_uuid"] = contributor_uuid
submission["contributor_name"] = contributor_name
submission["creation_timestamp"] = timestamp_str
# Generate unique filename
content_json = json.dumps(submissions, indent=2, sort_keys=True)
content_hash = compute_content_hash(content_json)
filename = generate_submission_filename(author_username, date_str, content_hash)
file_path = f"community/{filename}"
# Create branch
branch_name = f"community-submission-{issue_number}"
default_sha = get_default_branch_sha()
create_branch(branch_name, default_sha)
# Create file
commit_message = f"Add community submission from @{author_username} (closes #{issue_number})"
create_or_update_file(file_path, content_json, commit_message, branch_name)
# Create PR
pr_body = f"""## Community Submission
Adds {len(submissions)} submission(s) from @{author_username}.
**File:** `{file_path}`
**Contributor UUID:** `{contributor_uuid}`
Closes #{issue_number}
---
### Submissions
```json
{content_json}
```"""
pr = create_pull_request(
title=f"Community submission: {filename}",
head=branch_name,
base="main",
body=pr_body,
)
# Add labels to PR
add_labels_to_issue(pr["number"], ["community", "auto-generated"])
# Comment on original issue
add_issue_comment(
issue_number,
f"✅ **Submission Approved**\n\n"
f"PR #{pr['number']} has been created to add your submission.\n\n"
f"**File:** `{file_path}`\n"
f"**Your Contributor UUID:** `{contributor_uuid}`\n\n"
f"The PR will be merged by a maintainer."
)
print(f"Created PR #{pr['number']} for submission")
return True
def main():
parser = argparse.ArgumentParser(description="Approve community submission and create PR")
parser.add_argument("--issue-number", type=int, required=True, help="GitHub issue number")
parser.add_argument("--issue-body", required=True, help="Issue body text")
parser.add_argument("--author", required=True, help="Issue author username")
parser.add_argument("--author-id", type=int, required=True, help="Issue author numeric ID")
args = parser.parse_args()
success = process_submission(
issue_number=args.issue_number,
issue_body=args.issue_body,
author_username=args.author,
author_id=args.author_id,
)
sys.exit(0 if success else 1)
if __name__ == "__main__":
main()
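For orientation, here is a minimal offline sketch of the non-network part of process_submission: a hypothetical single-entry submission gets contributor metadata stamped and a community/ file path derived. The author (plane-spotter, ID 12345) and the record are invented for this sketch, and the package is assumed to be importable as src.contributions.

```python
import json
from datetime import datetime, timezone

from src.contributions.contributor import (
    generate_contributor_uuid,
    generate_submission_filename,
    compute_content_hash,
)

# Hypothetical author and submission, invented for this sketch.
author_username, author_id = "plane-spotter", 12345
submissions = [{"registration_number": "N12345"}]

now = datetime.now(timezone.utc)
for submission in submissions:
    submission["contributor_uuid"] = generate_contributor_uuid(author_id)
    submission["contributor_name"] = f"@{author_username}"
    submission["creation_timestamp"] = now.isoformat()

content_json = json.dumps(submissions, indent=2, sort_keys=True)
filename = generate_submission_filename(
    author_username, now.strftime("%Y-%m-%d"), compute_content_hash(content_json)
)
print(f"community/{filename}")  # the file path an approval PR would add
```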
src/contributions/contributor.py (+86)
@@ -0,0 +1,86 @@
"""Contributor identification utilities."""
import hashlib
import uuid
# DNS namespace UUID for generating UUIDv5
DNS_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
def generate_contributor_uuid(github_user_id: int) -> str:
"""
Generate a deterministic UUID v5 from a GitHub user ID.
This ensures the same GitHub account always gets the same contributor UUID.
Args:
github_user_id: The numeric GitHub user ID
Returns:
UUID string in standard format
"""
name = f"github:{github_user_id}"
return str(uuid.uuid5(DNS_NAMESPACE, name))
def sanitize_username(username: str, max_length: int = 20) -> str:
"""
Sanitize a GitHub username for use in filenames.
Args:
username: GitHub username
max_length: Maximum length of sanitized name
Returns:
Lowercase alphanumeric string with underscores
"""
sanitized = ""
for char in username.lower():
if char.isalnum():
sanitized += char
else:
sanitized += "_"
# Collapse multiple underscores
while "__" in sanitized:
sanitized = sanitized.replace("__", "_")
return sanitized.strip("_")[:max_length]
def generate_submission_filename(
username: str,
date_str: str,
content_hash: str,
extension: str = ".json"
) -> str:
"""
Generate a unique filename for a community submission.
Format: {sanitized_username}_{date}_{short_hash}.json
Args:
username: GitHub username
date_str: Date in YYYY-MM-DD format
content_hash: Hash of the submission content (will be truncated to 8 chars)
extension: File extension (default: .json)
Returns:
Unique filename string
"""
sanitized_name = sanitize_username(username)
short_hash = content_hash[:8]
return f"{sanitized_name}_{date_str}_{short_hash}{extension}"
def compute_content_hash(content: str) -> str:
"""
Compute SHA256 hash of content.
Args:
content: String content to hash
Returns:
Hex digest of SHA256 hash
"""
return hashlib.sha256(content.encode()).hexdigest()
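A quick sketch of the helpers above with invented inputs, illustrating that the UUID is stable per GitHub ID and how usernames are sanitized for filenames:

```python
from src.contributions.contributor import (
    generate_contributor_uuid,
    sanitize_username,
    compute_content_hash,
)

# 12345, 67890, and "Plane.Spotter-99" are made-up values for illustration.
uuid_a = generate_contributor_uuid(12345)
assert uuid_a == generate_contributor_uuid(12345)   # deterministic per GitHub ID
assert uuid_a != generate_contributor_uuid(67890)   # different account, different UUID

print(sanitize_username("Plane.Spotter-99"))        # -> plane_spotter_99
print(compute_content_hash("[]")[:8])               # 8-char prefix used in filenames
```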
src/contributions/create_daily_community_release.py (+141)
@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
Generate a daily CSV of all community contributions.
Reads all JSON files from the community/ directory and outputs a sorted CSV
with creation_timestamp as the first column and contributor_uuid as the last column.
Usage:
python -m src.contributions.create_daily_community_release
"""
from datetime import datetime, timezone
from pathlib import Path
import json
import sys
import pandas as pd
COMMUNITY_DIR = Path(__file__).parent.parent.parent / "community"
OUT_ROOT = Path("data/planequery_aircraft")
def read_all_submissions(community_dir: Path) -> list[dict]:
"""Read all JSON submissions from the community directory."""
all_submissions = []
for json_file in sorted(community_dir.glob("*.json")):
try:
with open(json_file) as f:
data = json.load(f)
# Normalize to list
submissions = data if isinstance(data, list) else [data]
all_submissions.extend(submissions)
except (json.JSONDecodeError, OSError) as e:
print(f"Warning: Failed to read {json_file}: {e}", file=sys.stderr)
return all_submissions
def submissions_to_dataframe(submissions: list[dict]) -> pd.DataFrame:
"""
Convert submissions to a DataFrame with proper column ordering.
Column order:
- creation_timestamp (first)
- transponder_code_hex
- registration_number
- planequery_airframe_id
- contributor_name
- [other columns alphabetically]
- contributor_uuid (last)
"""
if not submissions:
return pd.DataFrame()
df = pd.DataFrame(submissions)
# Ensure required columns exist
required_cols = [
"creation_timestamp",
"transponder_code_hex",
"registration_number",
"planequery_airframe_id",
"contributor_name",
"contributor_uuid",
]
for col in required_cols:
if col not in df.columns:
df[col] = None
# Sort by creation_timestamp ascending
df = df.sort_values("creation_timestamp", ascending=True, na_position="last")
# Reorder columns: specific order first, contributor_uuid last
first_cols = [
"creation_timestamp",
"transponder_code_hex",
"registration_number",
"planequery_airframe_id",
"contributor_name",
]
last_cols = ["contributor_uuid"]
middle_cols = sorted([
col for col in df.columns
if col not in first_cols and col not in last_cols
])
ordered_cols = first_cols + middle_cols + last_cols
df = df[ordered_cols]
return df.reset_index(drop=True)
def main():
"""Generate the daily community contributions CSV."""
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
print(f"Reading community submissions from {COMMUNITY_DIR}")
submissions = read_all_submissions(COMMUNITY_DIR)
if not submissions:
print("No community submissions found.")
# Still create an empty CSV with headers
df = pd.DataFrame(columns=[
"creation_timestamp",
"transponder_code_hex",
"registration_number",
"planequery_airframe_id",
"contributor_name",
"tags",
"contributor_uuid",
])
else:
print(f"Found {len(submissions)} total submissions")
df = submissions_to_dataframe(submissions)
# Determine date range for filename
if not df.empty and df["creation_timestamp"].notna().any():
# Get earliest timestamp for start date
earliest = pd.to_datetime(df["creation_timestamp"]).min()
start_date_str = earliest.strftime("%Y-%m-%d")
else:
start_date_str = date_str
# Output
OUT_ROOT.mkdir(parents=True, exist_ok=True)
output_file = OUT_ROOT / f"planequery_aircraft_community_{start_date_str}_{date_str}.csv"
df.to_csv(output_file, index=False)
print(f"Saved: {output_file}")
print(f"Total contributions: {len(df)}")
return output_file
if __name__ == "__main__":
main()
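A small sketch of submissions_to_dataframe's column ordering and sort behaviour, using two invented records (pandas required):

```python
from src.contributions.create_daily_community_release import submissions_to_dataframe

# Two made-up records; note the second has an earlier timestamp.
records = [
    {"creation_timestamp": "2026-02-10T12:00:00+00:00",
     "registration_number": "N12345",
     "contributor_name": "Jane", "contributor_uuid": "uuid-a",
     "tags": ["warbird"]},
    {"creation_timestamp": "2026-02-09T08:30:00+00:00",
     "transponder_code_hex": "a1b2c3",
     "contributor_name": "Sam", "contributor_uuid": "uuid-b"},
]
df = submissions_to_dataframe(records)
print(list(df.columns))                  # creation_timestamp first, contributor_uuid last
print(df["contributor_name"].tolist())   # ['Sam', 'Jane'] after ascending timestamp sort
```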
src/contributions/read_community_data.py (+115)
@@ -0,0 +1,115 @@
#!/usr/bin/env python3
"""
Read and aggregate all community submission data.
Usage:
python -m src.contributions.read_community_data
python -m src.contributions.read_community_data --output merged.json
"""
import argparse
import json
import sys
from pathlib import Path
COMMUNITY_DIR = Path(__file__).parent.parent.parent / "community"
def read_all_submissions(community_dir: Path | None = None) -> list[dict]:
"""
Read all JSON submissions from the community directory.
Args:
community_dir: Path to community directory. Uses default if None.
Returns:
List of all submission dictionaries
"""
if community_dir is None:
community_dir = COMMUNITY_DIR
all_submissions = []
for json_file in sorted(community_dir.glob("*.json")):
try:
with open(json_file) as f:
data = json.load(f)
# Normalize to list
submissions = data if isinstance(data, list) else [data]
# Add source file metadata
for submission in submissions:
submission["_source_file"] = json_file.name
all_submissions.extend(submissions)
except (json.JSONDecodeError, OSError) as e:
print(f"Warning: Failed to read {json_file}: {e}", file=sys.stderr)
return all_submissions
def group_by_identifier(submissions: list[dict]) -> dict[str, list[dict]]:
"""
Group submissions by their identifier (registration, transponder, or airframe ID).
Returns:
Dict mapping identifier to list of submissions for that identifier
"""
grouped = {}
for submission in submissions:
# Determine identifier
if "registration_number" in submission:
key = f"reg:{submission['registration_number']}"
elif "transponder_code_hex" in submission:
key = f"icao:{submission['transponder_code_hex']}"
elif "planequery_airframe_id" in submission:
key = f"id:{submission['planequery_airframe_id']}"
else:
key = "_unknown"
if key not in grouped:
grouped[key] = []
grouped[key].append(submission)
return grouped
def main():
parser = argparse.ArgumentParser(description="Read community submission data")
parser.add_argument("--output", "-o", help="Output file (default: stdout)")
parser.add_argument("--group", action="store_true", help="Group by identifier")
parser.add_argument("--stats", action="store_true", help="Print statistics only")
args = parser.parse_args()
submissions = read_all_submissions()
if args.stats:
grouped = group_by_identifier(submissions)
contributors = set(s.get("contributor_uuid", "unknown") for s in submissions)
print(f"Total submissions: {len(submissions)}")
print(f"Unique identifiers: {len(grouped)}")
print(f"Unique contributors: {len(contributors)}")
return
if args.group:
result = group_by_identifier(submissions)
else:
result = submissions
output = json.dumps(result, indent=2)
if args.output:
with open(args.output, "w") as f:
f.write(output)
print(f"Wrote {len(submissions)} submissions to {args.output}")
else:
print(output)
if __name__ == "__main__":
main()
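A sketch of group_by_identifier on invented in-memory records, no community/ files needed:

```python
from src.contributions.read_community_data import group_by_identifier

# Invented records for illustration only.
sample = [
    {"registration_number": "N12345", "contributor_uuid": "uuid-a"},
    {"registration_number": "N12345", "contributor_uuid": "uuid-b"},
    {"transponder_code_hex": "a1b2c3", "contributor_uuid": "uuid-a"},
]
grouped = group_by_identifier(sample)
print(sorted(grouped))              # ['icao:a1b2c3', 'reg:N12345']
print(len(grouped["reg:N12345"]))   # 2 - two contributors reported the same aircraft
```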
src/contributions/schema.py (+117)
@@ -0,0 +1,117 @@
"""Schema validation for community submissions."""
import json
import re
from pathlib import Path
from typing import Any
try:
from jsonschema import Draft202012Validator
except ImportError:
Draft202012Validator = None
SCHEMA_PATH = Path(__file__).parent.parent.parent / "schemas" / "community_submission.v1.schema.json"
def load_schema() -> dict:
"""Load the community submission schema."""
with open(SCHEMA_PATH) as f:
return json.load(f)
def validate_submission(data: dict | list, schema: dict | None = None) -> list[str]:
"""
Validate submission(s) against schema.
Args:
data: Single submission dict or list of submissions
schema: Optional schema dict. If None, loads from default path.
Returns:
List of error messages. Empty list means validation passed.
"""
if Draft202012Validator is None:
raise ImportError("jsonschema is required: pip install jsonschema")
if schema is None:
schema = load_schema()
submissions = data if isinstance(data, list) else [data]
errors = []
validator = Draft202012Validator(schema)
for i, submission in enumerate(submissions):
prefix = f"[{i}] " if len(submissions) > 1 else ""
for error in validator.iter_errors(submission):
path = ".".join(str(p) for p in error.path) if error.path else "(root)"
errors.append(f"{prefix}{path}: {error.message}")
return errors
def extract_json_from_issue_body(body: str) -> str | None:
"""
Extract JSON from GitHub issue body.
Looks for JSON in the 'Submission JSON' section wrapped in code blocks.
Args:
body: The issue body text
Returns:
Extracted JSON string or None if not found
"""
# Match JSON in "### Submission JSON" section
pattern = r"### Submission JSON\s*\n\s*```(?:json)?\s*\n([\s\S]*?)\n\s*```"
match = re.search(pattern, body)
if match:
return match.group(1).strip()
return None
def extract_contributor_name_from_issue_body(body: str) -> str | None:
"""
Extract contributor name from GitHub issue body.
Looks for the 'Contributor Name' field in the issue form.
Args:
body: The issue body text
Returns:
Contributor name string or None if not found/empty
"""
# Match "### Contributor Name" section
pattern = r"### Contributor Name\s*\n\s*(.+?)(?=\n###|\n\n|$)"
match = re.search(pattern, body)
if match:
name = match.group(1).strip()
# GitHub issue forms show "_No response_" for empty optional fields
if name and name != "_No response_":
return name
return None
def parse_and_validate(json_str: str, schema: dict | None = None) -> tuple[list | dict | None, list[str]]:
"""
Parse JSON string and validate against schema.
Args:
json_str: JSON string to parse
schema: Optional schema dict
Returns:
Tuple of (parsed data or None, list of errors)
"""
try:
data = json.loads(json_str)
except json.JSONDecodeError as e:
return None, [f"Invalid JSON: {e}"]
errors = validate_submission(data, schema)
return data, errors
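A sketch of the extraction and validation path, using a hand-written issue body and a stand-in schema; the real schemas/community_submission.v1.schema.json is not shown in this diff, and the jsonschema package must be installed:

```python
from src.contributions.schema import (
    extract_contributor_name_from_issue_body,
    extract_json_from_issue_body,
    parse_and_validate,
)

# Issue body as GitHub issue forms render it; the fenced block is built
# programmatically so this snippet stays self-contained.
issue_body = "\n".join([
    "### Contributor Name",
    "",
    "Jane Spotter",
    "",
    "### Submission JSON",
    "",
    "`" * 3 + "json",
    '[{"registration_number": "N12345", "transponder_code_hex": "a1b2c3"}]',
    "`" * 3,
])

# Stand-in schema for illustration only; the real one ships in schemas/.
stand_in_schema = {
    "type": "object",
    "required": ["registration_number"],
    "properties": {"registration_number": {"type": "string"}},
}

print(extract_contributor_name_from_issue_body(issue_body))  # -> Jane Spotter
json_str = extract_json_from_issue_body(issue_body)
data, errors = parse_and_validate(json_str, stand_in_schema)
print(errors or f"OK: {len(data)} submission(s)")             # -> OK: 1 submission(s)
```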
src/contributions/validate_submission.py (+140)
@@ -0,0 +1,140 @@
#!/usr/bin/env python3
"""
Validate a community submission from a GitHub issue.
This script is called by the GitHub Actions workflow to validate
submissions when issues are opened or edited.
Usage:
python -m src.contributions.validate_submission --issue-body "..."
python -m src.contributions.validate_submission --file submission.json
echo '{"registration_number": "N12345"}' | python -m src.contributions.validate_submission --stdin
Environment variables (for GitHub Actions):
GITHUB_TOKEN: GitHub API token
GITHUB_REPOSITORY: owner/repo
ISSUE_NUMBER: Issue number to comment on
"""
import argparse
import json
import os
import sys
import urllib.request
import urllib.error
from .schema import extract_json_from_issue_body, parse_and_validate, load_schema
def github_api_request(method: str, endpoint: str, data: dict | None = None) -> dict:
"""Make a GitHub API request."""
token = os.environ.get("GITHUB_TOKEN")
repo = os.environ.get("GITHUB_REPOSITORY")
if not token or not repo:
raise EnvironmentError("GITHUB_TOKEN and GITHUB_REPOSITORY must be set")
url = f"https://api.github.com/repos/{repo}{endpoint}"
headers = {
"Authorization": f"token {token}",
"Accept": "application/vnd.github.v3+json",
"Content-Type": "application/json",
}
body = json.dumps(data).encode() if data else None
req = urllib.request.Request(url, data=body, headers=headers, method=method)
with urllib.request.urlopen(req) as response:
return json.loads(response.read())
def add_issue_comment(issue_number: int, body: str) -> None:
"""Add a comment to a GitHub issue."""
github_api_request("POST", f"/issues/{issue_number}/comments", {"body": body})
def add_issue_label(issue_number: int, label: str) -> None:
"""Add a label to a GitHub issue."""
github_api_request("POST", f"/issues/{issue_number}/labels", {"labels": [label]})
def remove_issue_label(issue_number: int, label: str) -> None:
"""Remove a label from a GitHub issue."""
try:
github_api_request("DELETE", f"/issues/{issue_number}/labels/{label}")
except urllib.error.HTTPError:
pass # Label might not exist
def validate_and_report(json_str: str, issue_number: int | None = None) -> bool:
"""
Validate JSON and optionally report to GitHub issue.
Args:
json_str: JSON string to validate
issue_number: Optional issue number to comment on
Returns:
True if validation passed, False otherwise
"""
data, errors = parse_and_validate(json_str)
if errors:
error_list = "\n".join(f"- {e}" for e in errors)
message = f"❌ **Validation Failed**\n\n{error_list}\n\nPlease fix the errors and edit your submission."
print(message, file=sys.stderr)
if issue_number:
add_issue_comment(issue_number, message)
remove_issue_label(issue_number, "validated")
return False
count = len(data) if isinstance(data, list) else 1
message = f"✅ **Validation Passed**\n\n{count} submission(s) validated successfully against the schema.\n\nA maintainer can approve this submission by adding the `approved` label."
print(message)
if issue_number:
add_issue_comment(issue_number, message)
add_issue_label(issue_number, "validated")
return True
def main():
parser = argparse.ArgumentParser(description="Validate community submission JSON")
source_group = parser.add_mutually_exclusive_group(required=True)
source_group.add_argument("--issue-body", help="Issue body text containing JSON")
source_group.add_argument("--file", help="JSON file to validate")
source_group.add_argument("--stdin", action="store_true", help="Read JSON from stdin")
parser.add_argument("--issue-number", type=int, help="GitHub issue number to comment on")
args = parser.parse_args()
# Get JSON string
if args.issue_body:
json_str = extract_json_from_issue_body(args.issue_body)
if not json_str:
print("❌ Could not extract JSON from issue body", file=sys.stderr)
if args.issue_number:
add_issue_comment(
args.issue_number,
"❌ **Validation Failed**\n\nCould not extract JSON from submission. "
"Please ensure your JSON is in the 'Submission JSON' field wrapped in code blocks."
)
sys.exit(1)
elif args.file:
with open(args.file) as f:
json_str = f.read()
else: # stdin
json_str = sys.stdin.read()
# Validate
success = validate_and_report(json_str, args.issue_number)
sys.exit(0 if success else 1)
if __name__ == "__main__":
main()