Source code for hissp.munger

# Copyright 2019, 2020, 2021, 2024 Matthew Egan Odendahl
# SPDX-License-Identifier: Apache-2.0
"""
Lissp's `symbol token` munger.

Encodes Lissp symbol tokens with special characters into valid,
human-readable (if unpythonic) Python identifiers,
using NFKC normalization and `Quotez`.

E.g. ``*FOO-BAR*`` becomes ``QzSTAR_FOOQzH_BARQzSTAR_``.

Quotez are written in upper case and wrapped in a ``Qz`` and ``_``.
This format was chosen because it contains an underscore
and both upper-case and lower-case letters,
which makes it distinct from
`standard Python naming conventions`__:
``lower_case_with_underscores``,
``UPPER_CASE_WITH_UNDERSCORES``,
and ``CapWords``,
as well as an extremely rare bigram, "Qz",
which makes the Quotez (but not the normalization)
reversible in the usual cases,
and also cannot introduce a leading underscore,
which can have special meaning in Python.

__ https://www.python.org/dev/peps/pep-0008/#naming-conventions

Characters can be encoded in one of three ways:
Short names, Unicode names, and ordinals.

The :func:`demunge` function will accept any of these encodings,
while the :func:`munge` function will prioritize short names,
then fall back to Unicode names, then fall back to ordinals.

Short names are given in the `TO_NAME` table in this module.

Any spaces in the Unicode names are replaced with an ``x`` and
any hyphens are replaced with an ``h``.
(Unicode names are in all caps and these substitutions are lower-case.)

Ordinals are given in a hexadecimal format like ``0XF00``.
"""

import re
import unicodedata
from collections.abc import Hashable, Mapping
from contextlib import suppress
from typing import TypeVar


def munge(s: str) -> str:
    """
    Munge a Lissp symbol into a valid Python identifier.

    The input is first NFKC-normalized. If the result is already an
    identifier, it is returned as-is. Otherwise, each piece between
    full stops is encoded separately with `Quotez` and the pieces are
    rejoined with full stops, because full stops are meaningful to Hissp
    and must survive munging.
    """
    # Python applies NFKC to identifiers itself, so do the same up front:
    # >>> 𝐀 = 'MATHEMATICAL BOLD CAPITAL A'
    # >>> 'A' in globals()
    # True
    normalized = unicodedata.normalize("NFKC", s)
    if not normalized.isidentifier():
        pieces = normalized.split(".")
        return ".".join(map(_munge_part, pieces))
    return normalized  # Already a valid identifier; nothing to encode.
def _munge_part(part):
    """
    Munge one full-stop-free piece of a symbol.

    An empty piece is returned unchanged. Otherwise every character that
    is not legal inside an identifier is `Quotez`-encoded, and if the
    result still is not an identifier (e.g. a leading digit), the first
    character is force-encoded as well.
    """
    if not part:
        return part
    encoded = "".join(qz_encode(char) for char in part)
    if not encoded.isidentifier():
        # Every character is legal mid-identifier by now, so only the
        # leading one can still be at fault.
        encoded = force_qz_encode(encoded[0]) + encoded[1:]
        assert encoded.isidentifier(), f"{encoded!r} is not identifier"
    return encoded


QUOTEZ = "Qz{}_"
"""Format string for creating `Quotez`."""

FIND_QUOTEZ = re.compile(QUOTEZ.format("([0-9A-Z][0-9A-Zhx]*?)"))
"""Regex pattern to find `Quotez`. Used by `demunge`."""

TO_NAME = {
    char: QUOTEZ.format(short_name)
    for char, short_name in (
        # ASCII control characters don't munge to names.
        ("!", "BANG"),
        ('"', "QUOT"),
        ("#", "HASH"),
        ("$", "DOLR"),
        ("%", "PCENT"),
        ("&", "ET"),
        ("'", "APOS"),
        ("(", "LPAR"),
        (")", "RPAR"),
        ("*", "STAR"),
        ("+", "PLUS"),
        # COMMA is fine.
        ("-", "H"),  # Hyphen-minus
        (".", "DOT"),  # Doesn't munge by default.
        ("/", "SOL"),
        # Digits only munge if first character.
        # COLON is fine.
        (";", "SEMI"),
        ("<", "LT"),  # Less Than or LefT.
        ("=", "EQ"),
        (">", "GT"),  # Greater Than or riGhT.
        ("?", "QUERY"),
        ("@", "AT"),
        # Capital letters are always valid in Python identifiers.
        ("[", "LSQB"),
        ("\\", "BSOL"),
        ("]", "RSQB"),
        ("^", "HAT"),
        # Underscore is valid in Python identifiers.
        ("`", "GRAVE"),
        # Small letters are also always valid.
        ("{", "LCUB"),
        ("|", "VERT"),
        ("}", "RCUB"),
        # TILDE is fine.
    )
}
"""Shorter names for `Quotez`."""

# Translation table (code point -> code point) applied to Unicode names:
# spaces become "x" and hyphens become "h".
_QZ_NAME = {ord(" "): ord("x"), ord("-"): ord("h")}
def qz_encode(c: str) -> str:
    """
    Return ``c`` unchanged if it may appear in a Python identifier,
    otherwise return its `Quotez` encoding.
    """
    # Prefixing a known-good first character tests whether ``c`` is legal
    # in a non-leading position (so digits pass here).
    legal_in_identifier = ("x" + c).isidentifier()
    return c if legal_in_identifier else force_qz_encode(c)
def force_qz_encode(c: str) -> str:
    """
    Return the `Quotez` encoding of ``c``,
    even if ``c`` is already valid in a Python identifier.

    The short name from `TO_NAME` is preferred. Failing that, the Unicode
    name is used (translated through ``_QZ_NAME``). Failing that, the
    ordinal is used, in a hexadecimal format like ``0XF00``.
    """
    try:
        return TO_NAME[c]
    except LookupError:
        pass  # No short name; try the Unicode name next.
    try:
        unicode_name = unicodedata.name(c)
        return QUOTEZ.format(unicode_name.translate(_QZ_NAME))
    except ValueError:
        pass  # The character has no Unicode name; use its ordinal.
    ordinal = f"{ord(c):#X}"
    return QUOTEZ.format(ordinal)
K = TypeVar("K", bound=Hashable)
# Values become the keys of the inverted dict, so they must be hashable too.
V = TypeVar("V", bound=Hashable)


def _inverse_1to1(mapping: Mapping[K, V]) -> dict[V, K]:
    """Return the value-to-key inverse of ``mapping``.

    The mapping must be one-to-one; this is checked with an assertion so
    that a duplicated value cannot silently drop an entry.
    """
    result = {v: k for k, v in mapping.items()}
    assert len(mapping) == len(result)
    return result


LOOKUP_NAME = _inverse_1to1(TO_NAME)
"""The inverse of `TO_NAME`."""

# Undoes the ``_QZ_NAME`` translation: "x" -> space and "h" -> hyphen.
_UN_QZ_NAME = _inverse_1to1(_QZ_NAME)


def _qz_decode(match: re.Match[str]) -> str:
    """Decode one `Quotez` match into its character.

    Used as the substitution callback for ``FIND_QUOTEZ``. Short names are
    tried first, then Unicode names, then ``0X``-prefixed hexadecimal
    ordinals. A match that is not valid under any of these is returned
    unchanged.
    """
    with suppress(KeyError):
        return LOOKUP_NAME[match.group()]
    with suppress(KeyError):
        return unicodedata.lookup(match.group(1).translate(_UN_QZ_NAME))
    # int() raises ValueError on non-hex text and chr() raises ValueError
    # past the Unicode range, but chr() raises OverflowError instead when
    # the number does not fit in a C int. Both mean "invalid Quotez".
    with suppress(ValueError, OverflowError):
        if match.group(1).startswith("0X"):
            return chr(int(match.group(1), 16))
    return match.group()
def demunge(s: str) -> str:
    """The inverse of :func:`munge`. Decodes any `Quotez` into characters.

    A character may have been encoded as a short name, a Unicode name,
    or an ordinal, and ``demunge`` decodes all three. :func:`munge`
    consistently picks only one of these for any given character,
    but which Unicode characters have names depends on the Python version.

    Text outside of Quotez is left as-is, and so are invalid Quotez.

    >>> demunge("QzFOO_QzGT_QzHYPHENhMINUS_Qz0X3E_bar")
    'QzFOO_>->bar'

    """
    return re.sub(FIND_QUOTEZ, _qz_decode, s)