# Copyright 2008-2021 pydicom authors. See LICENSE file for details.
"""Handle alternate character sets for character strings."""

import codecs
import re
from typing import (
    TYPE_CHECKING,
    cast,
)
from collections.abc import MutableSequence, Sequence

from pydicom import config
from pydicom.misc import warn_and_log
from pydicom.valuerep import TEXT_VR_DELIMS, PersonName, VR, CUSTOMIZABLE_CHARSET_VR

if TYPE_CHECKING:  # pragma: no cover
    from pydicom.dataelem import DataElement


# default encoding if no encoding defined - corresponds to ISO IR 6 / ASCII
default_encoding = "iso8859"

# Map DICOM Specific Character Set to python equivalent
# https://docs.python.org/3/library/codecs.html#standard-encodings
python_encoding = {
    # default character set for DICOM
    "": default_encoding,
    # alias for latin_1 too (iso_ir_6 exists as an alias to 'ascii')
    "ISO_IR 6": default_encoding,
    "ISO_IR 13": "shift_jis",
    "ISO_IR 100": "latin_1",
    "ISO_IR 101": "iso8859_2",
    "ISO_IR 109": "iso8859_3",
    "ISO_IR 110": "iso8859_4",
    "ISO_IR 126": "iso_ir_126",  # Greek
    "ISO_IR 127": "iso_ir_127",  # Arabic
    "ISO_IR 138": "iso_ir_138",  # Hebrew
    "ISO_IR 144": "iso_ir_144",  # Russian
    "ISO_IR 148": "iso_ir_148",  # Turkish
    "ISO_IR 166": "iso_ir_166",  # Thai
    "ISO 2022 IR 6": "iso8859",  # alias for latin_1 too
    "ISO 2022 IR 13": "shift_jis",
    "ISO 2022 IR 87": "iso2022_jp",
    "ISO 2022 IR 100": "latin_1",
    "ISO 2022 IR 101": "iso8859_2",
    "ISO 2022 IR 109": "iso8859_3",
    "ISO 2022 IR 110": "iso8859_4",
    "ISO 2022 IR 126": "iso_ir_126",
    "ISO 2022 IR 127": "iso_ir_127",
    "ISO 2022 IR 138": "iso_ir_138",
    "ISO 2022 IR 144": "iso_ir_144",
    "ISO 2022 IR 148": "iso_ir_148",
    "ISO 2022 IR 149": "euc_kr",
    "ISO 2022 IR 159": "iso2022_jp_2",
    "ISO 2022 IR 166": "iso_ir_166",
    "ISO 2022 IR 58": "iso_ir_58",
    "ISO_IR 192": "UTF8",  # from Chinese example, 2008 PS3.5 Annex J p1-4
    "GB18030": "GB18030",
    "ISO 2022 GBK": "GBK",  # from DICOM correction CP1234
    "ISO 2022 58": "GB2312",  # from DICOM correction CP1234
    "GBK": "GBK",  # from DICOM correction CP1234
}

# these encodings cannot be used with code extensions
# see DICOM Standard, Part 3, Table C.12-5
# and DICOM Standard, Part 5, Section 6.1.2.5.4, item d
STAND_ALONE_ENCODINGS = ("ISO_IR 192", "GBK", "GB18030")

# the escape character used to mark the start of escape sequences
ESC = b"\x1b"

# Map Python encodings to escape sequences as defined in PS3.3 in tables
# C.12-3 (single-byte) and C.12-4 (multi-byte character sets).
CODES_TO_ENCODINGS = {
    ESC + b"(B": default_encoding,  # used to switch to ASCII G0 code element
    ESC + b"-A": "latin_1",
    ESC + b")I": "shift_jis",  # switches to ISO-IR 13
    ESC + b"(J": "shift_jis",  # switches to ISO-IR 14 (shift_jis handles both)
    ESC + b"$B": "iso2022_jp",
    ESC + b"-B": "iso8859_2",
    ESC + b"-C": "iso8859_3",
    ESC + b"-D": "iso8859_4",
    ESC + b"-F": "iso_ir_126",
    ESC + b"-G": "iso_ir_127",
    ESC + b"-H": "iso_ir_138",
    ESC + b"-L": "iso_ir_144",
    ESC + b"-M": "iso_ir_148",
    ESC + b"-T": "iso_ir_166",
    ESC + b"$)C": "euc_kr",
    ESC + b"$(D": "iso2022_jp_2",
    ESC + b"$)A": "iso_ir_58",
}

# Reverse mapping; "shift_jis" has two escape codes above, so the one used
# by default for encoding is pinned explicitly.
ENCODINGS_TO_CODES = {v: k for k, v in CODES_TO_ENCODINGS.items()}
ENCODINGS_TO_CODES["shift_jis"] = ESC + b")I"

# Multi-byte character sets except Korean are handled by Python.
# To decode them, the escape sequence shall be preserved in the input byte
# string, and will be removed during decoding by Python.
handled_encodings = ("iso2022_jp", "iso2022_jp_2", "iso_ir_58")


def _encode_to_jis_x_0201(value: str, errors: str = "strict") -> bytes:
    """Convert a unicode string into JIS X 0201 byte string using shift_jis
    encodings.

    shift_jis is a superset of jis_x_0201. So we can regard the encoded value
    as jis_x_0201 if it is single byte character.

    Parameters
    ----------
    value : str
        The unicode string as presented to the user.
    errors : str
        The behavior of a character which could not be encoded. If 'strict' is
        passed, raise an UnicodeEncodeError. If any other value is passed,
        non ISO IR 14 characters are replaced by the ASCII '?'.

    Returns
    -------
    bytes
        The encoded string. If some characters in value could not be encoded to
        JIS X 0201, and `errors` is not set to 'strict', they are replaced to
        '?'.

    Raises
    ------
    UnicodeEncodeError
        If errors is set to 'strict' and `value` could not be encoded with
        JIS X 0201.
    """
    encoder_class = codecs.getincrementalencoder("shift_jis")
    encoder = encoder_class()

    # If errors is not strict, this function is used as fallback.
    # In this case, we use only ISO IR 14 to encode given value
    # without escape sequence.
    if errors != "strict" or value == "":
        encoded = b""
        for c in value:
            try:
                b = encoder.encode(c)
            except UnicodeEncodeError:
                b = b"?"

            # anything that is not a single 7-bit byte is outside ISO IR 14
            if len(b) != 1 or 0x80 <= ord(b):
                b = b"?"
            encoded += b
        return encoded

    encoded = encoder.encode(value[0])
    if len(encoded) != 1:
        raise UnicodeEncodeError(
            "shift_jis", value, 0, len(value), "illegal multibyte sequence"
        )

    msb = ord(encoded) & 0x80  # msb is 1 for ISO IR 13, 0 for ISO IR 14
    for i, c in enumerate(value[1:], 1):
        try:
            b = encoder.encode(c)
        except UnicodeEncodeError as e:
            # report the position relative to the whole value
            e.start = i
            e.end = len(value)
            raise
        # all characters must be single byte and share the high bit of the
        # first character, i.e. belong to the same character set
        if len(b) != 1 or ((ord(b) & 0x80) ^ msb) != 0:
            character_set = "ISO IR 14" if msb == 0 else "ISO IR 13"
            msg = f"Given character is out of {character_set}"
            raise UnicodeEncodeError("shift_jis", value, i, len(value), msg)
        encoded += b

    return encoded


def _encode_to_jis_x_0208(value: str, errors: str = "strict") -> bytes:
    """Convert a unicode string into JIS X 0208 encoded bytes."""
    return _encode_to_given_charset(value, "ISO 2022 IR 87", errors=errors)


def _encode_to_jis_x_0212(value: str, errors: str = "strict") -> bytes:
    """Convert a unicode string into JIS X 0212 encoded bytes."""
    return _encode_to_given_charset(value, "ISO 2022 IR 159", errors=errors)


def _encode_to_given_charset(
    value: str, character_set: str, errors: str = "strict"
) -> bytes:
    """Encode a unicode string using the given character set.

    The escape sequence which is located at the end of the encoded value has
    to vary depending on the value 1 of SpecificCharacterSet. So we have to
    trim it and append the correct escape sequence manually.

    Parameters
    ----------
    value : text type
        The unicode string as presented to the user.
    character_set: str:
        Character set for result.
    errors : str
        The behavior of a character which could not be encoded. This value
        is passed to errors argument of str.encode().

    Returns
    -------
    bytes
        The encoded string. If some characters in value could not be encoded to
        given character_set, it depends on the behavior of corresponding python
        encoder. An empty `value` is encoded as ``b""``.

    Raises
    ------
    UnicodeEncodeError
        If errors is set to 'strict' and `value` could not be encoded with
        given character_set.
    """
    encoding = python_encoding[character_set]

    # If errors is not strict, this function is used as fallback.
    # So keep the tail escape sequence of encoded for backward compatibility.
    if errors != "strict":
        return value.encode(encoding, errors=errors)

    # Nothing to encode; avoids indexing into an empty string below.
    if not value:
        return b""

    encoder_class = codecs.getincrementalencoder(encoding)
    encoder = encoder_class()

    # The first character must switch into the requested character set,
    # i.e. its encoded form has to start with that set's escape sequence.
    encoded = encoder.encode(value[0])
    if not encoded.startswith(ENCODINGS_TO_CODES[encoding]):
        raise UnicodeEncodeError(
            encoding, value, 0, len(value), f"Given character is out of {character_set}"
        )

    for i, c in enumerate(value[1:], 1):
        try:
            b = encoder.encode(c)
        except UnicodeEncodeError as e:
            # report the position relative to the whole value
            e.start = i
            e.end = len(value)
            raise
        # a new escape sequence means the encoder left the character set
        if b[:1] == ESC:
            raise UnicodeEncodeError(
                encoding,
                value,
                i,
                len(value),
                f"Given character is out of {character_set}",
            )
        encoded += b
    return encoded


def _get_escape_sequence_for_encoding(
    encoding: str, encoded: bytes | None = None
) -> bytes:
    """Return an escape sequence corresponding to the given encoding. If
    encoding is 'shift_jis', return 'ESC)I' or 'ESC(J' depending on the first
    byte of encoded.

    Parameters
    ----------
    encoding : str
        An encoding is used to specify an escape sequence.
    encoded : bytes
        The encoded value is used to choose an escape sequence if encoding is
        'shift_jis'. If ``None`` or empty, 'ESC(J' is returned for
        'shift_jis'.

    Returns
    -------
    bytes
        Escape sequence for encoded value, or ``b""`` if no escape sequence
        is known for `encoding`.
    """
    ESC_ISO_IR_14 = ESC + b"(J"
    ESC_ISO_IR_13 = ESC + b")I"

    if encoding == "shift_jis":
        # no first byte to inspect - fall back to ISO IR 14
        if not encoded:
            return ESC_ISO_IR_14

        # high bit set in the first byte -> ISO IR 13, else ISO IR 14
        if 0x80 <= encoded[0]:
            return ESC_ISO_IR_13

        return ESC_ISO_IR_14

    return ENCODINGS_TO_CODES.get(encoding, b"")


# These encodings need escape sequence to handle alphanumeric characters.
need_tail_escape_sequence_encodings = ("iso2022_jp", "iso2022_jp_2")

# Encodings that are not encoded with str.encode() directly, but with one of
# the dedicated functions above.
custom_encoders = {
    "shift_jis": _encode_to_jis_x_0201,
    "iso2022_jp": _encode_to_jis_x_0208,
    "iso2022_jp_2": _encode_to_jis_x_0212,
}


def decode_bytes(value: bytes, encodings: Sequence[str], delimiters: set[int]) -> str:
    """Decode an encoded byte `value` into a unicode string using `encodings`.

    Parameters
    ----------
    value : bytes
        The encoded byte string in the DICOM element value.
    encodings : list of str
        The encodings needed to decode the string as a list of Python
        encodings, converted from the encodings in (0008,0005) *Specific
        Character Set*.
    delimiters : set of int
        A set of characters or character codes, each of which resets the
        encoding in `value`.

    Returns
    -------
    str
        The decoded unicode string. If the value could not be decoded,
        and :attr:`~pydicom.config.settings.reading_validation_mode`
        is not ``RAISE``, a warning is issued, and `value` is
        decoded using the first encoding with replacement characters,
        resulting in data loss.

    Raises
    ------
    UnicodeDecodeError
        If :attr:`~pydicom.config.settings.reading_validation_mode`
        is ``RAISE`` and `value` could not be decoded with the given
        encodings.
    LookupError
        If :attr:`~pydicom.config.settings.reading_validation_mode`
        is ``RAISE`` and the given encodings are invalid.
    """
    # shortcut for the common case - no escape sequences present
    if ESC not in value:
        first_encoding = encodings[0]
        try:
            return value.decode(first_encoding)
        except LookupError:
            if config.settings.reading_validation_mode == config.RAISE:
                raise
            # IGNORE is handled as WARN here, as this is
            # not an optional validation check
            warn_and_log(
                f"Unknown encoding '{first_encoding}' - using default "
                "encoding instead"
            )
            first_encoding = default_encoding
            return value.decode(first_encoding)
        except UnicodeError:
            if config.settings.reading_validation_mode == config.RAISE:
                raise
            warn_and_log(
                "Failed to decode byte string with encoding "
                f"'{first_encoding}' - using replacement characters in "
                "decoded string"
            )
            return value.decode(first_encoding, errors="replace")

    # Each part of the value that starts with an escape sequence is decoded
    # separately. If it starts with an escape sequence, the
    # corresponding encoding is used, otherwise (e.g. the first part if it
    # does not start with an escape sequence) the first encoding.
    # See PS3.5, 6.1.2.4 and 6.1.2.5 for the use of code extensions.
    #
    # The following regex splits the value into these parts, by matching
    # the substring until the first escape character, and subsequent
    # substrings starting with an escape character.
    regex = b"(^[^\x1b]+|[\x1b][^\x1b]*)"
    fragments: list[bytes] = re.findall(regex, value)

    # decode each byte string fragment with its corresponding encoding
    # and join them all together
    return "".join(
        [_decode_fragment(fragment, encodings, delimiters) for fragment in fragments]
    )


# alias of `decode_bytes` under a second name
decode_string = decode_bytes


def _decode_fragment(
    byte_str: bytes, encodings: Sequence[str], delimiters: set[int]
) -> str:
    """Decode a byte string encoded with a single encoding.

    If `byte_str` starts with an escape sequence, the encoding corresponding
    to this sequence is used for decoding if present in `encodings`,
    otherwise the first value in encodings.
    If a delimiter occurs inside the string, it resets the encoding to the
    first encoding in case of single-byte encodings.

    Parameters
    ----------
    byte_str : bytes
        The encoded string to be decoded.
    encodings: list of str
        The list of Python encodings as converted from the values in the
        Specific Character Set tag.
    delimiters: set of int
        A set of characters or character codes, each of which resets the
        encoding in `byte_str`.

    Returns
    -------
    str
        The decoded unicode string. If the value could not be decoded,
        and :attr:`~pydicom.config.settings.reading_validation_mode` is not
        set to ``RAISE``, a warning is issued, and the value is
        decoded using the first encoding with replacement characters,
        resulting in data loss.

    Raises
    ------
    UnicodeDecodeError
        If :attr:`~pydicom.config.settings.reading_validation_mode` is set
        to ``RAISE`` and `value` could not be decoded with the given
        encodings.

    References
    ----------
    * DICOM Standard, Part 5,
      :dcm:`Sections 6.1.2.4` and
      :dcm:`6.1.2.5`
    * DICOM Standard, Part 3,
      :dcm:`Annex C.12.1.1.2`
    """
    try:
        if byte_str.startswith(ESC):
            return _decode_escaped_fragment(byte_str, encodings, delimiters)
        # no escape sequence - use first encoding
        return byte_str.decode(encodings[0])
    except UnicodeError:
        if config.settings.reading_validation_mode == config.RAISE:
            raise
        warn_and_log(
            "Failed to decode byte string with encodings: "
            f"{', '.join(encodings)} - using replacement characters in "
            "decoded string"
        )
        return byte_str.decode(encodings[0], errors="replace")


def _decode_escaped_fragment(
    byte_str: bytes, encodings: Sequence[str], delimiters: set[int]
) -> str:
    """Decodes a byte string starting with an escape sequence.

    See `_decode_fragment` for parameter description and more information.

    Raises
    ------
    ValueError
        If the escape sequence is not known, or its encoding is neither in
        `encodings` nor the default encoding, and
        :attr:`~pydicom.config.settings.reading_validation_mode` is
        ``RAISE``.
    """
    # all 4-character escape codes start with one of two character sets
    seq_length = 4 if byte_str.startswith((b"\x1b$(", b"\x1b$)")) else 3
    encoding = CODES_TO_ENCODINGS.get(byte_str[:seq_length], "")
    if encoding in encodings or encoding == default_encoding:
        if encoding in handled_encodings:
            # Python strips the escape sequences for this encoding.
            # Any delimiters must be handled correctly by `byte_str`.
            return byte_str.decode(encoding)

        # Python doesn't know about the escape sequence -
        # we have to strip it before decoding
        byte_str = byte_str[seq_length:]

        # If a delimiter occurs in the string, it resets the encoding.
        # The following returns the first occurrence of a delimiter in
        # the byte string, or None if it does not contain any.
        index = next((idx for idx, ch in enumerate(byte_str) if ch in delimiters), None)
        if index is not None:
            # the part of the string after the first delimiter
            # is decoded with the first encoding
            return byte_str[:index].decode(encoding) + byte_str[index:].decode(
                encodings[0]
            )

        # No delimiter - use the encoding defined by the escape code
        return byte_str.decode(encoding)

    # unknown escape code - use first encoding
    msg = "Found unknown escape sequence in encoded string value"
    if config.settings.reading_validation_mode == config.RAISE:
        raise ValueError(msg)

    warn_and_log(f"{msg} - using encoding {encodings[0]}")
    return byte_str.decode(encodings[0], errors="replace")


def encode_string(value: str, encodings: Sequence[str]) -> bytes:
    """Encode a unicode string `value` into :class:`bytes` using `encodings`.

    Parameters
    ----------
    value : str
        The unicode string as presented to the user.
    encodings : list of str
        The encodings needed to encode the string as a list of Python
        encodings, converted from the encodings in (0008,0005) *Specific
        Character Set*.

    Returns
    -------
    bytes
        The encoded string. If `value` could not be encoded with any of
        the given encodings, and
        :attr:`~pydicom.config.settings.writing_validation_mode` is not
        ``RAISE``, a warning is issued, and `value` is encoded using
        the first encoding with replacement characters, resulting in data
        loss.

    Raises
    ------
    UnicodeEncodeError
        If :attr:`~pydicom.config.settings.writing_validation_mode`
        is set to ``RAISE`` and `value` could not be encoded with the
        supplied encodings.
    """
    # first attempt: encode the whole value with a single encoding
    for i, encoding in enumerate(encodings):
        try:
            encoded = _encode_string_impl(value, encoding)

            # any encoding other than the first needs a leading escape
            # sequence, unless the Python codec already emits it
            if i > 0 and encoding not in handled_encodings:
                escape_sequence = _get_escape_sequence_for_encoding(
                    encoding, encoded=encoded
                )
                encoded = escape_sequence + encoded
            if encoding in need_tail_escape_sequence_encodings:
                encoded += _get_escape_sequence_for_encoding(encodings[0])
            return encoded
        except UnicodeError:
            continue

    # if we have more than one encoding, we retry encoding by splitting
    # `value` into chunks that can be encoded with one of the encodings
    if len(encodings) > 1:
        try:
            return _encode_string_parts(value, encodings)
        except ValueError:
            pass

    # all attempts failed - raise or warn and encode with replacement
    # characters
    if config.settings.writing_validation_mode == config.RAISE:
        # force raising a valid UnicodeEncodeError - re-run the encoder that
        # already failed in the loop above, so that encodings with a custom
        # encoder raise as well
        _encode_string_impl(value, encodings[0])

    warn_and_log(
        f"Failed to encode value with encodings: {', '.join(encodings)} "
        "- using replacement characters in encoded string"
    )
    return _encode_string_impl(value, encodings[0], errors="replace")


def _encode_string_parts(value: str, encodings: Sequence[str]) -> bytes:
    """Convert a unicode string into a byte string using the given
    list of encodings.

    This is invoked if `encode_string` failed to encode `value` with a single
    encoding. We try instead to use different encodings for different parts
    of the string, using the encoding that can encode the longest part of
    the rest of the string as we go along.

    Parameters
    ----------
    value : str
        The unicode string as presented to the user.
    encodings : list of str
        The encodings needed to encode the string as a list of Python
        encodings, converted from the encodings in Specific Character Set.

    Returns
    -------
    bytes
        The encoded string, including the escape sequences needed to switch
        between different encodings.

    Raises
    ------
    ValueError
        If `value` could not be encoded with the given encodings.
    """
    encoded = bytearray()
    unencoded_part = value
    best_encoding = default_encoding
    while unencoded_part:
        # find the encoding that can encode the longest part of the rest
        # of the string still to be encoded
        max_index = 0
        for encoding in encodings:
            try:
                _encode_string_impl(unencoded_part, encoding)
                # if we get here, the whole rest of the value can be encoded
                best_encoding = encoding
                max_index = len(unencoded_part)
                break
            except (UnicodeDecodeError, UnicodeEncodeError) as err:
                if err.start > max_index:
                    # err.start is the index of first char we failed to encode
                    max_index = err.start
                    best_encoding = encoding

        # none of the given encodings can encode the first character - give up
        if max_index == 0:
            raise ValueError(
                "None of the given encodings can encode the first character"
            )

        # encode the part that can be encoded with the found encoding
        encoded_part = _encode_string_impl(unencoded_part[:max_index], best_encoding)
        if best_encoding not in handled_encodings:
            encoded += _get_escape_sequence_for_encoding(
                best_encoding, encoded=encoded_part
            )
        encoded += encoded_part
        # set remaining unencoded part of the string and handle that
        unencoded_part = unencoded_part[max_index:]

    # unencoded_part is empty - we are done, return the encoded string
    if best_encoding in need_tail_escape_sequence_encodings:
        encoded += _get_escape_sequence_for_encoding(encodings[0])
    return bytes(encoded)


def _encode_string_impl(value: str, encoding: str, errors: str = "strict") -> bytes:
    """Convert a unicode string into a byte string.

    If given encoding is in `custom_encoders`, use a corresponding
    `custom_encoder`. If given encoding is not in `custom_encoders`, use a
    corresponding python handled encoder.
    """
    if encoding in custom_encoders:
        return custom_encoders[encoding](value, errors=errors)

    return value.encode(encoding, errors=errors)


# DICOM PS3.5-2008 6.1.1 (p 18) says:
#   default is ISO-IR 6 G0, equiv to common chr set of ISO 8859 (PS3.5 6.1.2.1)
#    (0008,0005)  value 1 can *replace* the default encoding...
# for VRs of SH, LO, ST, LT, PN and UT (PS3.5 6.1.2.3)...
#   with a single-byte character encoding
# if (0008,0005) is multi-valued, then value 1 (or default if blank)...
#   is used until code extension escape sequence is hit,
#          which can be at start of string, or after CR/LF, FF, or
#          in Person Name PN, after ^ or =
# NOTE also that 7.5.3 SEQUENCE INHERITANCE states that if (0008,0005)
#       is not present in a sequence item then it is inherited from its parent.


def convert_encodings(encodings: None | str | MutableSequence[str]) -> list[str]:
    """Convert DICOM `encodings` into corresponding Python encodings.

    Handles some common spelling mistakes and issues a warning in this case.

    Handles stand-alone encodings: if they are the first encodings,
    additional encodings are ignored, if they are not the first encoding,
    they are ignored. In both cases, a warning is issued.

    Invalid encodings are replaced with the default encoding with a
    respective warning issued, if
    :attr:`~pydicom.config.settings.reading_validation_mode` is
    ``WARN``, or an exception is raised if it is set to
    ``RAISE``.

    Parameters
    ----------
    encodings : str or list of str
        The encoding or list of encodings as read from (0008,0005)
        *Specific Character Set*.

    Returns
    -------
    list of str
        A :class:`list` of Python encodings corresponding to the DICOM
        encodings. If an encoding is already a Python encoding, it is returned
        unchanged. Encodings with common spelling errors are replaced by the
        correct encoding, and invalid encodings are replaced with the default
        encoding if
        :attr:`~pydicom.config.settings.reading_validation_mode` is not
        set to ``RAISE``.

    Raises
    ------
    LookupError
        If `encodings` contains a value that could not be converted and
        :attr:`~pydicom.config.settings.reading_validation_mode` is
        ``RAISE``.
    """

    encodings = encodings or [""]

    if isinstance(encodings, str):
        encodings = [encodings]
    else:
        # If a list is passed, we don't want to modify the list
        # in place so copy it
        encodings = encodings[:]
        if not encodings[0]:
            encodings[0] = "ISO_IR 6"

    py_encodings = []
    for encoding in encodings:
        try:
            py_encodings.append(python_encoding[encoding])
        except KeyError:
            py_encodings.append(_python_encoding_for_corrected_encoding(encoding))

    if len(encodings) > 1:
        py_encodings = _handle_illegal_standalone_encodings(encodings, py_encodings)
    return py_encodings


def _python_encoding_for_corrected_encoding(encoding: str) -> str:
    """Try to replace the given invalid encoding with a valid encoding by
    checking for common spelling errors, and return the correct Python
    encoding for that encoding. Otherwise check if the
    encoding is already a valid Python encoding, and return that. If both
    attempts fail, return the default encoding.
    Issue a warning for the invalid encoding except for the case where it is
    already converted.
    """
    # standard encodings
    patched = None
    if re.match("^ISO[^_]IR", encoding) is not None:
        patched = "ISO_IR" + encoding[6:]
    # encodings with code extensions
    elif re.match("^(?=ISO.2022.IR.)(?!ISO 2022 IR )", encoding) is not None:
        patched = "ISO 2022 IR " + encoding[12:]

    if patched:
        # handle encoding patched for common spelling errors
        try:
            py_encoding = python_encoding[patched]
            _warn_about_invalid_encoding(encoding, patched)
            return py_encoding
        except KeyError:
            _warn_about_invalid_encoding(encoding)
            return default_encoding

    # fallback: assume that it is already a python encoding
    try:
        codecs.lookup(encoding)
        return encoding
    except LookupError:
        _warn_about_invalid_encoding(encoding)
        return default_encoding


def _warn_about_invalid_encoding(
    encoding: str, patched_encoding: str | None = None
) -> None:
    """Issue a warning for the given invalid encoding.
    If patched_encoding is given, it is mentioned as the
    replacement encoding, otherwise the default encoding.
    If no replacement encoding is given, and
    :attr:`~pydicom.config.settings.reading_validation_mode` is set to
    ``RAISE``, `LookupError` is raised.
    """
    if patched_encoding is None:
        if config.settings.reading_validation_mode == config.RAISE:
            raise LookupError(f"Unknown encoding '{encoding}'")

        msg = f"Unknown encoding '{encoding}' - using default encoding instead"
    else:
        msg = (
            f"Incorrect value for Specific Character Set '{encoding}' - "
            f"assuming '{patched_encoding}'"
        )
    warn_and_log(msg, stacklevel=2)


def _handle_illegal_standalone_encodings(
    encodings: MutableSequence[str], py_encodings: list[str]
) -> list[str]:
    """Check for stand-alone encodings in multi-valued encodings.
    If the first encoding is a stand-alone encoding, the rest of the
    encodings is removed. If any other encoding is a stand-alone encoding,
    it is removed from the encodings.
    """
    if encodings[0] in STAND_ALONE_ENCODINGS:
        warn_and_log(
            (
                f"Value '{encodings[0]}' for Specific Character Set does not "
                f"allow code extensions, ignoring: {', '.join(encodings[1:])}"
            ),
            stacklevel=2,
        )
        return py_encodings[:1]

    # iterate back to front so that deleting an entry does not shift the
    # indices of the entries still to be checked
    for i, encoding in reversed(list(enumerate(encodings[1:]))):
        if encoding in STAND_ALONE_ENCODINGS:
            warn_and_log(
                f"Value '{encoding}' cannot be used as code extension, ignoring it",
                stacklevel=2,
            )
            del py_encodings[i + 1]
    return py_encodings


def decode_element(
    elem: "DataElement", dicom_character_set: str | list[str] | None
) -> None:
    """Apply the DICOM character encoding to a data element

    Parameters
    ----------
    elem : dataelem.DataElement
        The :class:`DataElement<pydicom.dataelem.DataElement>` instance
        containing an encoded byte string value to decode.
    dicom_character_set : str or list of str or None
        The value of (0008,0005) *Specific Character Set*, which may be a
        single value, a multiple value (code extension), or may also be ``''``
        or ``None``, in which case ``'ISO_IR 6'`` will be used.
    """
    if elem.is_empty:
        return

    if not dicom_character_set:
        dicom_character_set = ["ISO_IR 6"]

    encodings = convert_encodings(dicom_character_set)

    # decode the string value to unicode
    # PN is special case as may have 3 components with different chr sets
    if elem.VR == VR.PN:
        if elem.VM == 1:
            # elem.value: PersonName | bytes
            elem.value = cast(PersonName, elem.value).decode(encodings)
        else:
            # elem.value: Iterable[PersonName | bytes]
            elem.value = [cast(PersonName, vv).decode(encodings) for vv in elem.value]
    elif elem.VR in CUSTOMIZABLE_CHARSET_VR:
        # You can't re-decode unicode (string literals in py3)
        if elem.VM == 1:
            if isinstance(elem.value, str):
                # already decoded
                return
            elem.value = decode_bytes(elem.value, encodings, TEXT_VR_DELIMS)
        else:
            # values that are already `str` are kept as they are
            elem.value = [
                (
                    value
                    if isinstance(value, str)
                    else decode_bytes(value, encodings, TEXT_VR_DELIMS)
                )
                for value in elem.value
            ]