mirror of
https://github.com/wiltodelta/remove-ai-watermarks.git
synced 2026-06-05 10:38:00 +02:00
5d0e6c3a65
Apply fixes from a full-repo review (code, tests, docs). Security / correctness: - Clamp attacker-controlled PNG/caBX chunk lengths to the remaining file size in metadata.py and noai/c2pa.py (a malformed length no longer drives a multi-GB read); skipped chunks seek instead of read. - noai/isobmff.strip_c2pa_boxes is now fail-safe on a malformed box: return the original bytes with a warning instead of silently truncating the tail, so metadata --remove can no longer emit a corrupt file. - doubao_engine._fixed_alpha_map clamps the glyph box to the image (no crash on degenerate width-vs-height). - watermark_remover._run_region_hires gates the phaseCorrelate offset on response and magnitude (a spurious shift no longer garbles text) and drops the generator after a CPU fallback (no MPS/CPU device mismatch). Robustness: - gemini_engine, doubao_engine, region_eraser normalize grayscale and RGBA inputs to BGR at the engine entry points. - image_io.imwrite returns False on an unwritable path (matches cv2). - invisible_engine guards a None imread result before use. - trustmark_detector._decoder uses a double-checked threading lock. - ctrlregen.tiling.tile_positions raises on overlap >= tile. - humanizer chromatic shift no longer wraps opposite-edge pixels. - identify OpenAI caveat keyed on the normalized vendor, not a substring. - Remove the dead "visible --detect-threshold" CLI option. - publish.yml verifies the release tag matches the package version. Docs: - README strength 0.05 to 0.10; .env.example HF_TOKEN marked optional; doubao_capture README updated to reverse-alpha-only; CLAUDE.md synced with the new behaviors and the batch command. Tests: new test_security_clamp.py for the read clamp and isobmff fail-safe; erase CLI coverage; integrity-clash rule 2 end-to-end; multi-tag EXIF survival and cross-format strip guards; channel/size, tiling, humanizer, and imwrite regressions. Full suite 493 passed, 2 skipped; ruff and pyright src/ clean. 
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
131 lines
4.7 KiB
Python
131 lines
4.7 KiB
Python
"""Regression guards for malformed-length DoS and removal-truncation bugs.
|
|
|
|
Two classes of verified bugs are locked in here:
|
|
|
|
1. PNG C2PA parsers (``c2pa.has_c2pa_metadata`` / ``extract_c2pa_info`` and
|
|
``metadata._png_late_metadata`` via ``scan_head``) used the raw 32-bit chunk
|
|
``length`` field directly in ``f.read(length)``. A crafted file can declare
|
|
``length = 0x7FFFFFFF`` (~2 GiB) on a 60-byte file, forcing a multi-GB
|
|
allocation. The fix clamps ``length`` to the bytes actually remaining.
|
|
|
|
2. ISOBMFF ``strip_c2pa_boxes`` truncated the file from a malformed box to EOF
|
|
(the box walker returns early), so ``remove_ai_metadata`` could emit a
|
|
shorter file and report success. The fix returns the input unchanged when the
|
|
walk does not reach EOF.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import struct
|
|
import tracemalloc
|
|
|
|
from remove_ai_watermarks import metadata
|
|
from remove_ai_watermarks.noai import c2pa, isobmff
|
|
|
|
# The 8-byte signature placed at the start of every crafted PNG fixture below.
PNG_SIG = b"\x89PNG\r\n\x1a\n"
|
|
# Bogus 32-bit length written into chunk/box headers by the malformed fixtures.
_HUGE = 0x7FFFFFFF # ~2 GiB declared length on a tiny file
|
|
|
|
|
|
def _png_with_huge_c2pa_chunk() -> bytes:
    """Build a tiny fake PNG whose C2PA chunk header overstates its length.

    Layout: the PNG signature, a big-endian 32-bit length field holding
    ``_HUGE``, the ``c2pa.C2PA_CHUNK_TYPE`` tag, and then a body of only a
    few bytes -- nowhere near what the header declares.
    """
    declared_length = struct.pack(">I", _HUGE)
    short_payload = b"jumbc2pa-not-really"  # far shorter than the declared length
    return b"".join(
        (PNG_SIG, declared_length, c2pa.C2PA_CHUNK_TYPE, short_payload)
    )
|
|
|
|
|
|
class TestPngLengthClampNoAlloc:
    """Clamping makes the parsers read only the real bytes, not the lie.

    Each test writes a small file whose chunk header declares ``_HUGE`` bytes,
    runs one parser over it while ``tracemalloc`` is tracing, and asserts that
    the peak traced allocation stays below ``_PEAK_CAP_BYTES``.
    """

    # Generous ceiling: well above the ~1 MB the largest fixture legitimately
    # needs, yet far below the ~2 GiB an unclamped ``read(length)`` would
    # request.
    _PEAK_CAP_BYTES = 50 * 1024 * 1024

    @staticmethod
    def _peak_traced_bytes(parse, path) -> int:
        """Run ``parse(path)`` under tracemalloc and return the peak size.

        Args:
            parse: Callable that takes the file path; its result is ignored.
            path: Path of the crafted file handed to ``parse``.

        Returns:
            The peak value reported by ``tracemalloc.get_traced_memory()``
            right after ``parse`` returns.

        Any exception raised by ``parse`` propagates, so a parser that
        crashes on the malformed file still fails the test.
        """
        # Only stop tracing if this helper started it; otherwise a session
        # launched with tracemalloc already enabled (``-X tracemalloc``)
        # would silently lose tracing for every later test.
        already_tracing = tracemalloc.is_tracing()
        if not already_tracing:
            tracemalloc.start()
        try:
            parse(path)
            _, peak = tracemalloc.get_traced_memory()
        finally:
            if not already_tracing:
                tracemalloc.stop()
        return peak

    @staticmethod
    def _write_evil_png(tmp_path):
        """Write the lying-length caBX fixture to ``tmp_path`` and return its path."""
        path = tmp_path / "evil.png"
        path.write_bytes(_png_with_huge_c2pa_chunk())
        return path

    def test_has_c2pa_metadata_is_bounded(self, tmp_path):
        path = self._write_evil_png(tmp_path)

        # Must return quickly without allocating gigabytes and without raising.
        peak = self._peak_traced_bytes(c2pa.has_c2pa_metadata, path)
        assert peak < self._PEAK_CAP_BYTES  # < 50 MB locks in the clamp

    def test_extract_c2pa_info_is_bounded(self, tmp_path):
        path = self._write_evil_png(tmp_path)

        peak = self._peak_traced_bytes(c2pa.extract_c2pa_info, path)
        assert peak < self._PEAK_CAP_BYTES

    def test_extract_c2pa_chunk_is_bounded(self, tmp_path):
        path = self._write_evil_png(tmp_path)

        peak = self._peak_traced_bytes(c2pa.extract_c2pa_chunk, path)
        assert peak < self._PEAK_CAP_BYTES

    def test_png_late_metadata_scan_is_bounded(self, tmp_path):
        # A PNG with a real IDAT pushing the late-scan window past 1 MB, then a
        # tEXt chunk lying about its length. scan_head() -> _png_late_metadata().
        idat = b"\x00" * (1024 * 1024 + 16)
        text_header = struct.pack(">I", _HUGE) + b"tEXt"
        blob = (
            PNG_SIG
            + struct.pack(">I", len(idat))
            + b"IDAT"
            + idat
            + b"\x00\x00\x00\x00"  # fake CRC
            + text_header
            + b"AIGC short"
        )
        path = tmp_path / "evil_late.png"
        path.write_bytes(blob)

        peak = self._peak_traced_bytes(metadata.scan_head, path)
        # head itself is ~1 MB; the clamp keeps the late read tiny. Generous cap.
        assert peak < self._PEAK_CAP_BYTES
|
|
|
|
|
|
def _box(box_type: bytes, payload: bytes) -> bytes:
|
|
return struct.pack(">I", 8 + len(payload)) + box_type + payload
|
|
|
|
|
|
class TestIsobmffStripFailSafe:
    """``strip_c2pa_boxes`` strips a clean file and leaves a malformed one intact."""

    def test_well_formed_file_still_strips_uuid(self):
        manifest_box = _box(b"uuid", isobmff.C2PA_UUID + b"manifest-bytes")
        data = b"".join(
            (
                _box(b"ftyp", b"isom\x00\x00\x00\x00mp42"),
                manifest_box,
                _box(b"mdat", b"\x00" * 32),
            )
        )

        output, removed = isobmff.strip_c2pa_boxes(data)

        # Exactly the one C2PA box is gone and nothing else changed size.
        assert removed == 1
        assert len(output) == len(data) - len(manifest_box)
        assert isobmff.C2PA_UUID not in output

    def test_malformed_box_does_not_truncate_tail(self):
        leading_boxes = (
            _box(b"ftyp", b"isom\x00\x00\x00\x00mp42"),
            _box(b"uuid", isobmff.C2PA_UUID + b"manifest-bytes"),
        )
        # A box claiming ~2 GiB before EOF: the walker returns early here.
        oversized_free = struct.pack(">I", _HUGE) + b"free" + b"\x00" * 16
        data = b"".join(leading_boxes) + oversized_free

        output, removed = isobmff.strip_c2pa_boxes(data)

        # Fail-safe: input returned unchanged, nothing stripped, no truncation.
        assert removed == 0
        assert output == data
        assert len(output) == len(data)
|