842 lines
30 KiB
Python
Executable File
842 lines
30 KiB
Python
Executable File
# Copyright 2008-2021 pydicom authors. See LICENSE file for details.
|
|
"""Handle alternate character sets for character strings."""
|
|
|
|
import codecs
|
|
import re
|
|
from typing import (
|
|
TYPE_CHECKING,
|
|
cast,
|
|
)
|
|
from collections.abc import MutableSequence, Sequence
|
|
|
|
from pydicom import config
|
|
from pydicom.misc import warn_and_log
|
|
from pydicom.valuerep import TEXT_VR_DELIMS, PersonName, VR, CUSTOMIZABLE_CHARSET_VR
|
|
|
|
if TYPE_CHECKING: # pragma: no cover
|
|
from pydicom.dataelem import DataElement
|
|
|
|
|
|
# default encoding if no encoding defined - corresponds to ISO IR 6 / ASCII
# ("iso8859" is the Python codec name used for it; the mapping table in this
# module notes that this name is an alias for latin_1 as well)
default_encoding = "iso8859"
|
|
|
|
# Map DICOM Specific Character Set to python equivalent
# https://docs.python.org/3/library/codecs.html#standard-encodings
#
# Keys are the values found in (0008,0005) Specific Character Set, values are
# the names of the Python codecs used to encode/decode with that character set.
python_encoding = {
    # default character set for DICOM
    "": default_encoding,
    # alias for latin_1 too (iso_ir_6 exists as an alias to 'ascii')
    "ISO_IR 6": default_encoding,
    "ISO_IR 13": "shift_jis",
    "ISO_IR 100": "latin_1",
    "ISO_IR 101": "iso8859_2",
    "ISO_IR 109": "iso8859_3",
    "ISO_IR 110": "iso8859_4",
    "ISO_IR 126": "iso_ir_126",  # Greek
    "ISO_IR 127": "iso_ir_127",  # Arabic
    "ISO_IR 138": "iso_ir_138",  # Hebrew
    "ISO_IR 144": "iso_ir_144",  # Russian
    "ISO_IR 148": "iso_ir_148",  # Turkish
    "ISO_IR 166": "iso_ir_166",  # Thai
    # "ISO 2022 ..." spellings: character sets used with code extensions
    "ISO 2022 IR 6": "iso8859",  # alias for latin_1 too
    "ISO 2022 IR 13": "shift_jis",
    "ISO 2022 IR 87": "iso2022_jp",
    "ISO 2022 IR 100": "latin_1",
    "ISO 2022 IR 101": "iso8859_2",
    "ISO 2022 IR 109": "iso8859_3",
    "ISO 2022 IR 110": "iso8859_4",
    "ISO 2022 IR 126": "iso_ir_126",
    "ISO 2022 IR 127": "iso_ir_127",
    "ISO 2022 IR 138": "iso_ir_138",
    "ISO 2022 IR 144": "iso_ir_144",
    "ISO 2022 IR 148": "iso_ir_148",
    "ISO 2022 IR 149": "euc_kr",
    "ISO 2022 IR 159": "iso2022_jp_2",
    "ISO 2022 IR 166": "iso_ir_166",
    "ISO 2022 IR 58": "iso_ir_58",
    "ISO_IR 192": "UTF8",  # from Chinese example, 2008 PS3.5 Annex J p1-4
    "GB18030": "GB18030",
    "ISO 2022 GBK": "GBK",  # from DICOM correction CP1234
    "ISO 2022 58": "GB2312",  # from DICOM correction CP1234
    "GBK": "GBK",  # from DICOM correction CP1234
}
|
|
|
|
# these encodings cannot be used with code extensions
# see DICOM Standard, Part 3, Table C.12-5
# and DICOM Standard, Part 5, Section 6.1.2.5.4, item d
# (the entries are DICOM Specific Character Set values, not Python codec names)
STAND_ALONE_ENCODINGS = ("ISO_IR 192", "GBK", "GB18030")
|
|
|
|
# the escape character used to mark the start of escape sequences
ESC = b"\x1b"

# Map Python encodings to escape sequences as defined in PS3.3 in tables
# C.12-3 (single-byte) and C.12-4 (multi-byte character sets).
# Keys are the complete escape sequences (ESC byte included), values are the
# Python codec names used to decode the bytes following that sequence.
CODES_TO_ENCODINGS = {
    ESC + b"(B": default_encoding,  # used to switch to ASCII G0 code element
    ESC + b"-A": "latin_1",
    ESC + b")I": "shift_jis",  # switches to ISO-IR 13
    ESC + b"(J": "shift_jis",  # switches to ISO-IR 14 (shift_jis handles both)
    ESC + b"$B": "iso2022_jp",
    ESC + b"-B": "iso8859_2",
    ESC + b"-C": "iso8859_3",
    ESC + b"-D": "iso8859_4",
    ESC + b"-F": "iso_ir_126",
    ESC + b"-G": "iso_ir_127",
    ESC + b"-H": "iso_ir_138",
    ESC + b"-L": "iso_ir_144",
    ESC + b"-M": "iso_ir_148",
    ESC + b"-T": "iso_ir_166",
    ESC + b"$)C": "euc_kr",
    ESC + b"$(D": "iso2022_jp_2",
    ESC + b"$)A": "iso_ir_58",
}

# Reverse lookup: Python codec name -> escape sequence.
# Two escape sequences map to "shift_jis", so the plain inversion keeps the
# one listed last (ESC + b"(J"); the entry is therefore set explicitly to the
# ISO-IR 13 sequence afterwards.
ENCODINGS_TO_CODES = {v: k for k, v in CODES_TO_ENCODINGS.items()}
ENCODINGS_TO_CODES["shift_jis"] = ESC + b")I"
|
|
|
|
# Multi-byte character sets except Korean are handled by Python.
# To decode them, the escape sequence shall be preserved in the input byte
# string, and will be removed during decoding by Python.
# For the same reason the encoding functions in this module do not prepend an
# escape sequence themselves for these codecs.
handled_encodings = ("iso2022_jp", "iso2022_jp_2", "iso_ir_58")
|
|
|
|
|
|
def _encode_to_jis_x_0201(value: str, errors: str = "strict") -> bytes:
|
|
"""Convert a unicode string into JIS X 0201 byte string using shift_jis
|
|
encodings.
|
|
shift_jis is a superset of jis_x_0201. So we can regard the encoded value
|
|
as jis_x_0201 if it is single byte character.
|
|
|
|
Parameters
|
|
----------
|
|
value : str
|
|
The unicode string as presented to the user.
|
|
errors : str
|
|
The behavior of a character which could not be encoded. If 'strict' is
|
|
passed, raise an UnicodeEncodeError. If any other value is passed,
|
|
non ISO IR 14 characters are replaced by the ASCII '?'.
|
|
|
|
Returns
|
|
-------
|
|
bytes
|
|
The encoded string. If some characters in value could not be encoded to
|
|
JIS X 0201, and `errors` is not set to 'strict', they are replaced to
|
|
'?'.
|
|
|
|
Raises
|
|
------
|
|
UnicodeEncodeError
|
|
If errors is set to 'strict' and `value` could not be encoded with
|
|
JIS X 0201.
|
|
"""
|
|
|
|
encoder_class = codecs.getincrementalencoder("shift_jis")
|
|
encoder = encoder_class()
|
|
|
|
# If errors is not strict, this function is used as fallback.
|
|
# In this case, we use only ISO IR 14 to encode given value
|
|
# without escape sequence.
|
|
if errors != "strict" or value == "":
|
|
encoded = b""
|
|
for c in value:
|
|
try:
|
|
b = encoder.encode(c)
|
|
except UnicodeEncodeError:
|
|
b = b"?"
|
|
|
|
if len(b) != 1 or 0x80 <= ord(b):
|
|
b = b"?"
|
|
encoded += b
|
|
return encoded
|
|
|
|
encoded = encoder.encode(value[0])
|
|
if len(encoded) != 1:
|
|
raise UnicodeEncodeError(
|
|
"shift_jis", value, 0, len(value), "illegal multibyte sequence"
|
|
)
|
|
|
|
msb = ord(encoded) & 0x80 # msb is 1 for ISO IR 13, 0 for ISO IR 14
|
|
for i, c in enumerate(value[1:], 1):
|
|
try:
|
|
b = encoder.encode(c)
|
|
except UnicodeEncodeError as e:
|
|
e.start = i
|
|
e.end = len(value)
|
|
raise e
|
|
if len(b) != 1 or ((ord(b) & 0x80) ^ msb) != 0:
|
|
character_set = "ISO IR 14" if msb == 0 else "ISO IR 13"
|
|
msg = f"Given character is out of {character_set}"
|
|
raise UnicodeEncodeError("shift_jis", value, i, len(value), msg)
|
|
encoded += b
|
|
|
|
return encoded
|
|
|
|
|
|
def _encode_to_jis_x_0208(value: str, errors: str = "strict") -> bytes:
    """Encode a unicode string as JIS X 0208 bytes.

    Delegates to `_encode_to_given_charset` with the DICOM character set
    ``"ISO 2022 IR 87"``; `errors` is passed through unchanged.
    """
    character_set = "ISO 2022 IR 87"
    return _encode_to_given_charset(value, character_set, errors=errors)
|
|
|
|
|
|
def _encode_to_jis_x_0212(value: str, errors: str = "strict") -> bytes:
    """Encode a unicode string as JIS X 0212 bytes.

    Delegates to `_encode_to_given_charset` with the DICOM character set
    ``"ISO 2022 IR 159"``; `errors` is passed through unchanged.
    """
    character_set = "ISO 2022 IR 159"
    return _encode_to_given_charset(value, character_set, errors=errors)
|
|
|
|
|
|
def _encode_to_given_charset(
    value: str, character_set: str, errors: str = "strict"
) -> bytes:
    """Encode a unicode string using the given character set.

    The escape sequence which is located at the end of the encoded value has
    to vary depending on the value 1 of SpecificCharacterSet. So we have to
    trim it and append the correct escape sequence manually. In strict mode
    the value is therefore encoded character by character with an incremental
    encoder that is never finalized.

    Parameters
    ----------
    value : str
        The unicode string as presented to the user.
    character_set : str
        Character set for result (a key of `python_encoding`).
    errors : str
        The behavior of a character which could not be encoded. This value
        is passed to errors argument of str.encode().

    Returns
    -------
    bytes
        The encoded string. If some characters in value could not be encoded to
        given character_set, it depends on the behavior of corresponding python
        encoder. An empty `value` is encoded as ``b""``.

    Raises
    ------
    UnicodeEncodeError
        If errors is set to 'strict' and `value` could not be encoded with
        given character_set.
    """
    encoding = python_encoding[character_set]
    # If errors is not strict, this function is used as fallback.
    # So keep the tail escape sequence of encoded for backward compatibility.
    if errors != "strict":
        return value.encode(encoding, errors=errors)

    # Nothing to encode: avoid indexing into an empty string below.
    if not value:
        return b""

    encoder_class = codecs.getincrementalencoder(encoding)
    encoder = encoder_class()

    # The first character must switch to the requested character set, i.e.
    # its encoded form has to start with the escape sequence of `encoding`.
    encoded = encoder.encode(value[0])
    if not encoded.startswith(ENCODINGS_TO_CODES[encoding]):
        raise UnicodeEncodeError(
            encoding, value, 0, len(value), f"Given character is out of {character_set}"
        )

    for i, c in enumerate(value[1:], 1):
        try:
            b = encoder.encode(c)
        except UnicodeEncodeError as e:
            # report the position relative to the complete value
            e.start = i
            e.end = len(value)
            raise e
        # A new escape sequence means the encoder had to switch to another
        # character set, so the character is not part of `character_set`.
        if b[:1] == ESC:
            raise UnicodeEncodeError(
                encoding,
                value,
                i,
                len(value),
                f"Given character is out of {character_set}",
            )
        encoded += b
    return encoded
|
|
|
|
|
|
def _get_escape_sequence_for_encoding(
    encoding: str, encoded: bytes | None = None
) -> bytes:
    """Return an escape sequence corresponding to the given encoding. If
    encoding is 'shift_jis', return 'ESC)I' or 'ESC(J' depending on the first
    byte of encoded.

    Parameters
    ----------
    encoding : str
        An encoding is used to specify an escape sequence.
    encoded : bytes or None
        The encoded value is used to choose an escape sequence if encoding is
        'shift_jis'. If it is ``None`` or empty, 'ESC(J' is used.

    Returns
    -------
    bytes
        Escape sequence for encoded value, or ``b""`` if no escape sequence
        is known for `encoding`.
    """
    ESC_ISO_IR_14 = ESC + b"(J"
    ESC_ISO_IR_13 = ESC + b")I"

    if encoding == "shift_jis":
        # No bytes to inspect (None or b""): fall back to ISO IR 14 instead
        # of indexing into an empty byte string.
        if not encoded:
            return ESC_ISO_IR_14

        # A set high bit in the first byte marks an ISO IR 13 value.
        first_byte = encoded[0]
        if 0x80 <= first_byte:
            return ESC_ISO_IR_13

        return ESC_ISO_IR_14
    return ENCODINGS_TO_CODES.get(encoding, b"")
|
|
|
|
|
|
# These encodings need escape sequence to handle alphanumeric characters.
# After a value encoded with one of them, the encoding functions in this
# module append the escape sequence of the first encoding.
need_tail_escape_sequence_encodings = ("iso2022_jp", "iso2022_jp_2")

# Python codec name -> function used instead of ``str.encode`` for that codec.
# Each function is called as ``func(value, errors=errors)`` and returns bytes.
custom_encoders = {
    "shift_jis": _encode_to_jis_x_0201,
    "iso2022_jp": _encode_to_jis_x_0208,
    "iso2022_jp_2": _encode_to_jis_x_0212,
}
|
|
|
|
|
|
def decode_bytes(value: bytes, encodings: Sequence[str], delimiters: set[int]) -> str:
    """Decode the encoded byte string `value` into unicode using `encodings`.

    Parameters
    ----------
    value : bytes
        The encoded byte string in the DICOM element value.
    encodings : list of str
        The Python encodings needed to decode the string, as converted from
        the values of (0008,0005) *Specific Character Set*.
    delimiters : set of int
        A set of characters or character codes, each of which resets the
        encoding in `value`.

    Returns
    -------
    str
        The decoded unicode string. If the value could not be decoded,
        and :attr:`~pydicom.config.settings.reading_validation_mode`
        is not ``RAISE``, a warning is issued, and `value` is
        decoded using the first encoding with replacement characters,
        resulting in data loss.

    Raises
    ------
    UnicodeDecodeError
        If :attr:`~pydicom.config.settings.reading_validation_mode`
        is ``RAISE`` and `value` could not be decoded with the given
        encodings.
    LookupError
        If :attr:`~pydicom.config.settings.reading_validation_mode`
        is ``RAISE`` and the given encodings are invalid.
    """
    if ESC in value:
        # Escape sequences present: every part of the value that starts with
        # an escape sequence is decoded on its own, using the encoding that
        # belongs to that sequence. A leading part without escape sequence
        # is decoded with the first encoding.
        # See PS3.5, 6.1.2.4 and 6.1.2.5 for the use of code extensions.
        #
        # The pattern matches the substring up to the first escape character,
        # and each following substring starting with an escape character.
        pattern = b"(^[^\x1b]+|[\x1b][^\x1b]*)"
        parts: list[bytes] = re.findall(pattern, value)
        return "".join(
            _decode_fragment(part, encodings, delimiters) for part in parts
        )

    # Common case - no escape sequences, only the first encoding is relevant
    first_encoding = encodings[0]
    try:
        return value.decode(first_encoding)
    except LookupError:
        if config.settings.reading_validation_mode == config.RAISE:
            raise
        # IGNORE is handled as WARN here, as this is
        # not an optional validation check
        warn_and_log(
            f"Unknown encoding '{first_encoding}' - using default encoding instead"
        )
        return value.decode(default_encoding)
    except UnicodeError:
        if config.settings.reading_validation_mode == config.RAISE:
            raise
        warn_and_log(
            f"Failed to decode byte string with encoding '{first_encoding}' "
            "- using replacement characters in decoded string"
        )
        return value.decode(first_encoding, errors="replace")
|
|
|
|
|
|
# Second module-level name bound to the same function object as `decode_bytes`.
# NOTE(review): presumably kept for backward compatibility - confirm.
decode_string = decode_bytes
|
|
|
|
|
|
def _decode_fragment(
    byte_str: bytes, encodings: Sequence[str], delimiters: set[int]
) -> str:
    """Decode a byte string that was encoded with a single encoding.

    A fragment starting with an escape sequence is decoded with the encoding
    that belongs to this sequence if it is present in `encodings`; any other
    fragment is decoded with the first value in `encodings`.
    If a delimiter occurs inside the string, it resets the encoding to the
    first encoding in case of single-byte encodings.

    Parameters
    ----------
    byte_str : bytes
        The encoded string to be decoded.
    encodings: list of str
        The list of Python encodings as converted from the values in the
        Specific Character Set tag.
    delimiters: set of int
        A set of characters or character codes, each of which resets the
        encoding in `byte_str`.

    Returns
    -------
    str
        The decoded unicode string. If the value could not be decoded,
        and :attr:`~pydicom.config.settings.reading_validation_mode` is not
        set to ``RAISE``, a warning is issued, and the value is
        decoded using the first encoding with replacement characters,
        resulting in data loss.

    Raises
    ------
    UnicodeDecodeError
        If :attr:`~pydicom.config.settings.reading_validation_mode` is set
        to ``RAISE`` and `value` could not be decoded with the given
        encodings.

    References
    ----------
    * DICOM Standard, Part 5,
      :dcm:`Sections 6.1.2.4<part05/chapter_6.html#sect_6.1.2.4>` and
      :dcm:`6.1.2.5<part05/chapter_6.html#sect_6.1.2.5>`
    * DICOM Standard, Part 3,
      :dcm:`Annex C.12.1.1.2<part03/sect_C.12.html#sect_C.12.1.1.2>`
    """
    try:
        if not byte_str.startswith(ESC):
            # plain fragment - the first encoding applies
            return byte_str.decode(encodings[0])
        return _decode_escaped_fragment(byte_str, encodings, delimiters)
    except UnicodeError:
        if config.settings.reading_validation_mode == config.RAISE:
            raise
        tried = ", ".join(encodings)
        warn_and_log(
            f"Failed to decode byte string with encodings: {tried} "
            "- using replacement characters in decoded string"
        )
        return byte_str.decode(encodings[0], errors="replace")
|
|
|
|
|
|
def _decode_escaped_fragment(
    byte_str: bytes, encodings: Sequence[str], delimiters: set[int]
) -> str:
    """Decode a byte string that starts with an escape sequence.

    See `_decode_fragment` for parameter description and more information.

    Raises
    ------
    ValueError
        If the escape sequence is unknown (or its encoding is not part of
        `encodings`) and
        :attr:`~pydicom.config.settings.reading_validation_mode` is ``RAISE``.
    """
    # Escape codes are 3 bytes long, except for the 4-byte codes which all
    # start with one of two prefixes.
    seq_length = 3
    if byte_str.startswith((b"\x1b$(", b"\x1b$)")):
        seq_length = 4

    encoding = CODES_TO_ENCODINGS.get(byte_str[:seq_length], "")
    if encoding not in encodings and encoding != default_encoding:
        # unknown escape code - use first encoding
        msg = "Found unknown escape sequence in encoded string value"
        if config.settings.reading_validation_mode == config.RAISE:
            raise ValueError(msg)

        warn_and_log(f"{msg} - using encoding {encodings[0]}")
        return byte_str.decode(encodings[0], errors="replace")

    if encoding in handled_encodings:
        # Python strips the escape sequences for this encoding.
        # Any delimiters must be handled correctly by `byte_str`.
        return byte_str.decode(encoding)

    # Python doesn't know about the escape sequence -
    # we have to strip it before decoding
    payload = byte_str[seq_length:]

    # A delimiter resets the encoding: everything from the first delimiter
    # on is decoded with the first encoding.
    for position, byte in enumerate(payload):
        if byte in delimiters:
            head = payload[:position].decode(encoding)
            return head + payload[position:].decode(encodings[0])

    # No delimiter - use the encoding defined by the escape code
    return payload.decode(encoding)
|
|
|
|
|
|
def encode_string(value: str, encodings: Sequence[str]) -> bytes:
    """Encode a unicode string `value` into :class:`bytes` using `encodings`.

    Parameters
    ----------
    value : str
        The unicode string as presented to the user.
    encodings : list of str
        The encodings needed to encode the string as a list of Python
        encodings, converted from the encodings in (0008,0005) *Specific
        Character Set*.

    Returns
    -------
    bytes
        The encoded string. If `value` could not be encoded with any of
        the given encodings, and
        :attr:`~pydicom.config.settings.writing_validation_mode` is not
        ``RAISE``, a warning is issued, and `value` is encoded using
        the first encoding with replacement characters, resulting in data loss.

    Raises
    ------
    UnicodeEncodeError
        If :attr:`~pydicom.config.settings.writing_validation_mode`
        is set to ``RAISE`` and `value` could not be encoded with the
        supplied encodings.
    """
    # First attempt: encode the complete value with a single encoding
    for i, encoding in enumerate(encodings):
        try:
            encoded = _encode_string_impl(value, encoding)

            # Encodings other than the first one have to be announced by an
            # escape sequence, unless Python emits it itself.
            if i > 0 and encoding not in handled_encodings:
                escape_sequence = _get_escape_sequence_for_encoding(
                    encoding, encoded=encoded
                )
                encoded = escape_sequence + encoded
            if encoding in need_tail_escape_sequence_encodings:
                encoded += _get_escape_sequence_for_encoding(encodings[0])
            return encoded
        except UnicodeError:
            continue

    # if we have more than one encoding, we retry encoding by splitting
    # `value` into chunks that can be encoded with one of the encodings
    if len(encodings) > 1:
        try:
            return _encode_string_parts(value, encodings)
        except ValueError:
            pass
    # all attempts failed - raise or warn and encode with replacement
    # characters
    if config.settings.writing_validation_mode == config.RAISE:
        # Force raising a valid UnicodeEncodeError by repeating the strict
        # attempt with the first encoding, which already failed above.
        # The same encoder as in the loop is used on purpose: for encodings
        # with a custom encoder, a plain ``value.encode()`` may succeed where
        # the custom encoder fails, and no exception would be raised.
        _encode_string_impl(value, encodings[0])

    warn_and_log(
        f"Failed to encode value with encodings: {', '.join(encodings)} "
        "- using replacement characters in encoded string"
    )
    return _encode_string_impl(value, encodings[0], errors="replace")
|
|
|
|
|
|
def _encode_string_parts(value: str, encodings: Sequence[str]) -> bytes:
    """Encode `value` piecewise, switching between the given encodings.

    This is invoked if `encode_string` failed to encode `value` with a single
    encoding. The string is consumed from the left; in each step the encoding
    that can encode the longest prefix of the remaining string is used.

    Parameters
    ----------
    value : str
        The unicode string as presented to the user.
    encodings : list of str
        The encodings needed to encode the string as a list of Python
        encodings, converted from the encodings in Specific Character Set.

    Returns
    -------
    bytes
        The encoded string, including the escape sequences needed to switch
        between different encodings.

    Raises
    ------
    ValueError
        If `value` could not be encoded with the given encodings.
    """
    result = bytearray()
    remaining = value
    chosen = default_encoding
    while remaining:
        # Look for the encoding that gets furthest into `remaining`
        longest = 0
        for candidate in encodings:
            try:
                _encode_string_impl(remaining, candidate)
            except (UnicodeDecodeError, UnicodeEncodeError) as err:
                # err.start is the index of first char we failed to encode
                if err.start > longest:
                    longest = err.start
                    chosen = candidate
            else:
                # the whole rest of the value can be encoded
                chosen = candidate
                longest = len(remaining)
                break

        # none of the given encodings can encode the first character - give up
        if longest == 0:
            raise ValueError(
                "None of the given encodings can encode the first character"
            )

        # Encode the prefix with the selected encoding, announced by its
        # escape sequence unless Python emits that itself
        chunk = _encode_string_impl(remaining[:longest], chosen)
        if chosen not in handled_encodings:
            result += _get_escape_sequence_for_encoding(chosen, encoded=chunk)
        result += chunk

        # continue with what is left of the string
        remaining = remaining[longest:]

    # Everything is encoded; the last used encoding may require switching
    # back to the first encoding at the end
    if chosen in need_tail_escape_sequence_encodings:
        result += _get_escape_sequence_for_encoding(encodings[0])

    return bytes(result)
|
|
|
|
|
|
def _encode_string_impl(value: str, encoding: str, errors: str = "strict") -> bytes:
    """Encode a unicode string into a byte string with one encoding.

    Encodings listed in `custom_encoders` are encoded by the function
    registered there; every other encoding is passed to ``str.encode``.
    `errors` is forwarded unchanged in both cases.
    """
    custom_encoder = custom_encoders.get(encoding)
    if custom_encoder is None:
        return value.encode(encoding, errors=errors)

    return custom_encoder(value, errors=errors)
|
|
|
|
|
|
# DICOM PS3.5-2008 6.1.1 (p 18) says:
|
|
# default is ISO-IR 6 G0, equiv to common chr set of ISO 8859 (PS3.5 6.1.2.1)
|
|
# (0008,0005) value 1 can *replace* the default encoding...
|
|
# for VRs of SH, LO, ST, LT, PN and UT (PS3.5 6.1.2.3)...
|
|
# with a single-byte character encoding
|
|
# if (0008,0005) is multi-valued, then value 1 (or default if blank)...
|
|
# is used until code extension escape sequence is hit,
|
|
# which can be at start of string, or after CR/LF, FF, or
|
|
# in Person Name PN, after ^ or =
|
|
# NOTE also that 7.5.3 SEQUENCE INHERITANCE states that if (0008,0005)
|
|
# is not present in a sequence item then it is inherited from its parent.
|
|
|
|
|
|
def convert_encodings(encodings: None | str | MutableSequence[str]) -> list[str]:
    """Convert DICOM `encodings` into corresponding Python encodings.

    Some common spelling mistakes are corrected, with a warning issued.

    Stand-alone encodings get special treatment: as first encoding, all
    further encodings are ignored; at any other position, the stand-alone
    encoding itself is ignored. A warning is issued in both cases.

    Invalid encodings are replaced with the default encoding with a
    respective warning issued, if
    :attr:`~pydicom.config.settings.reading_validation_mode` is
    ``WARN``, or an exception is raised if it is set to
    ``RAISE``.

    Parameters
    ----------
    encodings : str or list of str
        The encoding or list of encodings as read from (0008,0005)
        *Specific Character Set*.

    Returns
    -------
    list of str
        A :class:`list` of Python encodings corresponding to the DICOM
        encodings. If an encoding is already a Python encoding, it is returned
        unchanged. Encodings with common spelling errors are replaced by the
        correct encoding, and invalid encodings are replaced with the default
        encoding if :attr:`~pydicom.config.settings.reading_validation_mode`
        is not set to ``RAISE``.

    Raises
    ------
    LookupError
        If `encodings` contains a value that could not be converted and
        :attr:`~pydicom.config.settings.reading_validation_mode` is
        ``RAISE``.
    """
    if not encodings:
        encodings = [""]

    if isinstance(encodings, str):
        dicom_names = [encodings]
    else:
        # Work on a copy, the list passed by the caller must stay untouched
        dicom_names = encodings[:]
        if not dicom_names[0]:
            dicom_names[0] = "ISO_IR 6"

    py_encodings = []
    for name in dicom_names:
        if name in python_encoding:
            py_encodings.append(python_encoding[name])
        else:
            py_encodings.append(_python_encoding_for_corrected_encoding(name))

    if len(dicom_names) > 1:
        py_encodings = _handle_illegal_standalone_encodings(
            dicom_names, py_encodings
        )

    return py_encodings
|
|
|
|
|
|
def _python_encoding_for_corrected_encoding(encoding: str) -> str:
    """Return the Python encoding for an `encoding` unknown to the mapping.

    First, common spelling errors are corrected and the Python encoding of
    the corrected value is returned. If the value does not look like a
    misspelled DICOM encoding, it is returned unchanged if it is a valid
    Python encoding. In all other cases the default encoding is returned.
    A warning is issued for the invalid encoding, except for the case where
    it already is a Python encoding.
    """
    if re.match("^ISO[^_]IR", encoding):
        # standard encodings
        patched = "ISO_IR" + encoding[6:]
    elif re.match("^(?=ISO.2022.IR.)(?!ISO 2022 IR )", encoding):
        # encodings with code extensions
        patched = "ISO 2022 IR " + encoding[12:]
    else:
        patched = None

    if patched:
        # the value looked like a misspelled DICOM encoding
        if patched in python_encoding:
            _warn_about_invalid_encoding(encoding, patched)
            return python_encoding[patched]

        _warn_about_invalid_encoding(encoding)
        return default_encoding

    # fallback: assume that it is already a python encoding
    try:
        codecs.lookup(encoding)
    except LookupError:
        _warn_about_invalid_encoding(encoding)
        return default_encoding

    return encoding
|
|
|
|
|
|
def _warn_about_invalid_encoding(
    encoding: str, patched_encoding: str | None = None
) -> None:
    """Issue a warning for the given invalid encoding.

    If `patched_encoding` is given, it is mentioned as the replacement
    encoding, otherwise the default encoding is mentioned.
    If no replacement encoding is given, and
    :attr:`~pydicom.config.settings.reading_validation_mode` is set to
    ``RAISE``, `LookupError` is raised instead of warning.
    """
    if patched_encoding is not None:
        msg = (
            f"Incorrect value for Specific Character Set '{encoding}' - "
            f"assuming '{patched_encoding}'"
        )
    elif config.settings.reading_validation_mode == config.RAISE:
        raise LookupError(f"Unknown encoding '{encoding}'")
    else:
        msg = f"Unknown encoding '{encoding}' - using default encoding instead"

    warn_and_log(msg, stacklevel=2)
|
|
|
|
|
|
def _handle_illegal_standalone_encodings(
    encodings: MutableSequence[str], py_encodings: list[str]
) -> list[str]:
    """Check for stand-alone encodings in multi-valued encodings.

    A stand-alone encoding in first position causes all following encodings
    to be dropped. A stand-alone encoding at any other position is dropped
    itself. A warning is issued for each case.

    `encodings` holds the DICOM values and `py_encodings` the Python
    encodings at the same positions; entries are removed from
    `py_encodings` in place.
    """
    first = encodings[0]
    if first in STAND_ALONE_ENCODINGS:
        ignored = ", ".join(encodings[1:])
        warn_and_log(
            (
                f"Value '{first}' for Specific Character Set does not "
                f"allow code extensions, ignoring: {ignored}"
            ),
            stacklevel=2,
        )
        return py_encodings[:1]

    # Walk backwards so that deleting an entry does not shift the positions
    # that are still to be checked.
    for position in range(len(encodings) - 1, 0, -1):
        name = encodings[position]
        if name in STAND_ALONE_ENCODINGS:
            warn_and_log(
                f"Value '{name}' cannot be used as code extension, ignoring it",
                stacklevel=2,
            )
            del py_encodings[position]

    return py_encodings
|
|
|
|
|
|
def decode_element(
    elem: "DataElement", dicom_character_set: str | list[str] | None
) -> None:
    """Apply the DICOM character encoding to a data element

    The value of `elem` is replaced in place by its decoded form; elements
    with other VRs than PN and the customizable text VRs are left untouched.

    Parameters
    ----------
    elem : dataelem.DataElement
        The :class:`DataElement<pydicom.dataelem.DataElement>` instance
        containing an encoded byte string value to decode.
    dicom_character_set : str or list of str or None
        The value of (0008,0005) *Specific Character Set*, which may be a
        single value, a multiple value (code extension), or may also be ``''``
        or ``None``, in which case ``'ISO_IR 6'`` will be used.
    """
    if elem.is_empty:
        return

    if not dicom_character_set:
        dicom_character_set = ["ISO_IR 6"]

    encodings = convert_encodings(dicom_character_set)

    # PN is special case as may have 3 components with different chr sets
    if elem.VR == VR.PN:
        if elem.VM == 1:
            # elem.value: PersonName | bytes
            elem.value = cast(PersonName, elem.value).decode(encodings)
        else:
            # elem.value: Iterable[PersonName | bytes]
            elem.value = [cast(PersonName, vv).decode(encodings) for vv in elem.value]
        return

    if elem.VR not in CUSTOMIZABLE_CHARSET_VR:
        return

    # Values that are already str have been decoded before and are kept
    if elem.VM == 1:
        if not isinstance(elem.value, str):
            elem.value = decode_bytes(elem.value, encodings, TEXT_VR_DELIMS)
        return

    elem.value = [
        item
        if isinstance(item, str)
        else decode_bytes(item, encodings, TEXT_VR_DELIMS)
        for item in elem.value
    ]
|