Source code for hissp.munger

# Copyright 2019, 2020, 2021 Matthew Egan Odendahl
# SPDX-License-Identifier: Apache-2.0
"""
Lissp's symbol munger.

Encodes Lissp symbols with special characters into valid,
human-readable (if unpythonic) Python identifiers,
using NFKC normalization and *Quotez*.

E.g. ``*FOO-BAR*`` becomes ``QzSTAR_FOOQz_BARQzSTAR_``.

Quotez are written in upper case and wrapped in a ``Qz`` and ``_``.
This format was chosen because it contains an underscore
and both upper-case and lower-case letters,
which makes it distinct from
`standard Python naming conventions`__:
``lower_case_with_underscores``,
``UPPER_CASE_WITH_UNDERSCORES``,
and ``CapWords``,
as well as an extremely rare bigram, "Qz",
which makes the Quotez (but not the normalization)
reversible in the usual cases,
and also cannot introduce a leading underscore,
which can have special meaning in Python.

__ https://www.python.org/dev/peps/pep-0008/#naming-conventions

Characters can be encoded in one of three ways:
Short names, Unicode names, and ordinals.

The `demunge` function will accept any of these encodings,
while the `munge` function will prioritize short names,
then fall back to Unicode names, then fall back to ordinals.

Short names are given in the `TO_NAME` table in this module.

Any spaces in the Unicode names are replaced with an ``x`` and
any hyphens are replaced with an ``h``.
(Unicode names are in all caps and these substitutions are lower-case.)

Ordinals are given in base 10.
"""

import re
import unicodedata
from contextlib import suppress
from typing import Dict, Hashable, Mapping, Match, TypeVar


[docs]def munge(s: str) -> str: """ Lissp's symbol munger. Encodes Lissp symbols with special characters into valid, human-readable (if unpythonic) Python identifiers, using NFKC normalization and *Quotez*. Inputs that begin with ``:`` are assumed to be control words and returned unmodified. Full stops are handled separately, as those are meaningful to Hissp. """ if s.startswith(":"): return s # control word return force_munge(s)
[docs]def force_munge(s: str) -> str: """As `munge`, but skips the control word check. Used for reader tags. """ # Always normalize identifiers: # >>> 𝐀 = 'MATHEMATICAL BOLD CAPITAL A' # >>> 'A' in globals() # True s = unicodedata.normalize("NFKC", s) if s.isidentifier(): return s # Nothing to munge. return ".".join(_munge_part(part) for part in s.split("."))
def _munge_part(part): if part: part = "".join(map(qz_encode, part)) if not part.isidentifier(): part = force_qz_encode(part[0]) + part[1:] assert part.isidentifier(), f"{part!r} is not identifier" return part QUOTEZ = "Qz{}_" """Format string for creating Quotez.""" FIND_QUOTEZ = re.compile(QUOTEZ.format("([0-9A-Z][0-9A-Zhx]*?)?")) """Regex pattern to find Quotez. Used by `demunge`.""" TO_NAME = { k: QUOTEZ.format(v) for k, v in { # ASCII control characters don't munge to names. "!": "BANG", '"': "QUOT", "#": "HASH", "$": "DOLR", "%": "PCENT", "&": "ET", "'": "APOS", "(": "LPAR", ")": "RPAR", "*": "STAR", "+": "PLUS", # COMMA is fine. "-": "", # Hyphen-minus # Full stop reserved for imports and attributes. "/": "SOL", # Digits only munge if first character. # COLON is fine. ";": "SEMI", "<": "LT", # Less Than or LefT. "=": "EQ", ">": "GT", # Greater Than or riGhT. "?": "QUERY", "@": "AT", # Capital letters are always valid in Python identifiers. "[": "LSQB", "\\": "BSOL", "]": "RSQB", "^": "HAT", # Underscore is valid in Python identifiers. "`": "GRAVE", # Small letters are also always valid. "{": "LCUB", "|": "VERT", "}": "RCUB", # TILDE is fine. }.items() } """Shorter names for Quotez.""" QZ_NAME = {ord(k): ord(v) for k, v in {" ": "x", "-": "h"}.items()}
[docs]def qz_encode(c: str) -> str: """ Converts a character to its Quotez encoding, unless it's already valid in a Python identifier. """ if ("x" + c).isidentifier(): return c return force_qz_encode(c)
[docs]def force_qz_encode(c: str) -> str: """ Converts a character to its Quotez encoding, even if it's valid in a Python identifier. """ with suppress(LookupError): return TO_NAME[c] with suppress(ValueError): return QUOTEZ.format(unicodedata.name(c).translate(QZ_NAME)) return QUOTEZ.format(ord(c))
K = TypeVar("K", bound=Hashable) V = TypeVar("V") def _inverse_1to1(mapping: Mapping[K, V]) -> Dict[V, K]: result = {v: k for k, v in mapping.items()} assert len(mapping) == len(result) return result LOOKUP_NAME = _inverse_1to1(TO_NAME) """The inverse of `TO_NAME`.""" UN_QZ_NAME = _inverse_1to1(QZ_NAME) def _qz_decode(match: Match[str]) -> str: with suppress(KeyError): return LOOKUP_NAME[match.group()] with suppress(KeyError): return unicodedata.lookup(match.group(1).translate(UN_QZ_NAME)) with suppress(ValueError): return chr(int(match.group(1))) return match.group()
[docs]def demunge(s: str) -> str: """The inverse of `munge`. Decodes any Quotez into characters. Characters can be encoded in one of three ways: Short names, Unicode names, and ordinals. `demunge` will decode any of these, even though `munge` will consistently pick only one of these for any given character. `demunge` will also leave the remaining text as-is, along with any invalid Quotez. >>> demunge("QzFOO_QzGT_QzHYPHENhMINUS_Qz62_bar") 'QzFOO_>->bar' """ return FIND_QUOTEZ.sub(_qz_decode, s)