# JPEG marker codes per ITU-T T.81.
JPEG_MARKER_PREFIX = 0xFF
JPEG_SOI = 0xD8  # Start Of Image
JPEG_EOI = 0xD9  # End Of Image
JPEG_SOS = 0xDA  # Start Of Scan
JPEG_COM = 0xFE  # Comment
JPEG_APP_FIRST, JPEG_APP_LAST = 0xE0, 0xEF  # APP0..APP15
# Marker codes that are emitted without a length field or payload.
JPEG_STANDALONE_MARKERS = frozenset(
    {0x00, 0x01, JPEG_SOI, JPEG_EOI} | set(range(0xD0, 0xD8))
)


def _strip_jpeg_segments(data):
    """
    Return a copy of a JPEG byte stream without its APPn and COM segments.

    :param data: full content of the JPEG file
    :type data: bytes
    :return: cleaned byte stream, or None if the stream does not start with
        SOI or a segment before the first scan is malformed
    """
    soi = bytes([JPEG_MARKER_PREFIX, JPEG_SOI])
    if not data.startswith(soi):
        return None

    out = bytearray(soi)
    i, n = 2, len(data)
    while i < n:
        # Every marker must be introduced by 0xFF; anything else means the
        # walker has lost sync and the next byte cannot be trusted as a marker.
        if data[i] != JPEG_MARKER_PREFIX:
            return None
        # Skip the prefix and any extra 0xFF fill bytes before the marker code.
        while i < n and data[i] == JPEG_MARKER_PREFIX:
            i += 1
        if i >= n:
            break
        marker = data[i]
        i += 1
        if marker in JPEG_STANDALONE_MARKERS:
            out += bytes([JPEG_MARKER_PREFIX, marker])
            if marker == JPEG_EOI:
                break
            continue
        # The two-byte big-endian length counts itself, so it must be >= 2 and
        # the whole segment must fit inside the file.
        if i + 2 > n:
            return None
        length = (data[i] << 8) | data[i + 1]
        if length < 2 or i + length > n:
            return None
        payload = data[i:i + length]
        i += length
        if JPEG_APP_FIRST <= marker <= JPEG_APP_LAST or marker == JPEG_COM:
            continue  # metadata holder: drop it
        out += bytes([JPEG_MARKER_PREFIX, marker]) + payload
        if marker == JPEG_SOS:
            # Everything after the first scan header is copied verbatim, so the
            # entropy-coded data is never re-encoded (and never inspected).
            out += data[i:]
            break
    return bytes(out)


def clear_jpeg_metadata(jpeg_file_name, in_place=False, verbose=False):
    """
    Remove all metadata from a JPEG file without re-encoding pixel data.

    APPn and COM segments located before the first Start Of Scan marker are
    dropped; every other byte is copied unchanged. Nothing is written when the
    input is rejected.

    :param jpeg_file_name: path to original JPEG file
    :type jpeg_file_name: str
    :param in_place: if True, overwrite the original file with cleaned version
    :type in_place: bool
    :param verbose: if True, print detailed output
    :type verbose: bool
    :return: path to cleaned JPEG file, or None if the path is not an existing
        .jpg/.jpeg file or its content is not a well-formed JPEG stream
    """
    if not os.path.isfile(jpeg_file_name) or not jpeg_file_name.lower().endswith((".jpg", ".jpeg")):
        return None

    with open(jpeg_file_name, "rb") as f:
        data = f.read()

    cleaned = _strip_jpeg_segments(data)
    if cleaned is None:
        # Malformed or truncated input: leave the file untouched rather than
        # overwrite it with a partially parsed stream.
        return None

    if in_place:
        output_path = jpeg_file_name
    else:
        base, ext = os.path.splitext(jpeg_file_name)
        output_path = base + "_cleaned" + ext

    with open(output_path, "wb") as f:
        f.write(cleaned)

    if verbose:
        action = "overwritten" if in_place else f"saved to {output_path}"
        print(f"Metadata cleared for: {jpeg_file_name} ({action})")

    return output_path
def test11():
    # clear the metadata of the .jpg file [not inplace]
    jpeg_file = os.path.join(TESTS_DIR_PATH, "test.jpg")
    output_path = clear_jpeg_metadata(jpeg_file, in_place=False, verbose=False)
    # The cleaned copy is written next to the source with a "_cleaned" suffix.
    assert output_path == os.path.join(TESTS_DIR_PATH, "test_cleaned.jpg")
    with Image.open(output_path) as img:
        assert img.info == {}


def test12():
    # clear the metadata of the .jpg file [inplace]
    import shutil
    import tempfile

    fixture = os.path.join(TESTS_DIR_PATH, "test.jpg")
    # Work on a temporary copy: overwriting the committed fixture would strip
    # its metadata for good and make every later run of these tests vacuous.
    with tempfile.TemporaryDirectory() as tmp_dir:
        jpeg_file = os.path.join(tmp_dir, "test.jpg")
        shutil.copyfile(fixture, jpeg_file)
        output_path = clear_jpeg_metadata(jpeg_file, in_place=True, verbose=False)
        assert output_path == jpeg_file
        with Image.open(jpeg_file) as img:
            assert img.info == {}