# Copyright 2019, 2020, 2021 Matthew Egan Odendahl
# SPDX-License-Identifier: Apache-2.0
"""
Lissp's symbol munger.
Encodes Lissp symbols with special characters into valid,
human-readable (if unpythonic) Python identifiers,
using NFKC normalization and *Quotez*.
E.g. ``*FOO-BAR*`` becomes ``QzSTAR_FOOQz_BARQzSTAR_``.
Quotez are written in upper case and wrapped in a ``Qz`` and ``_``.
This format was chosen because it contains an underscore
and both upper-case and lower-case letters,
which makes it distinct from
`standard Python naming conventions`__:
``lower_case_with_underscores``,
``UPPER_CASE_WITH_UNDERSCORES``,
and ``CapWords``,
as well as an extremely rare bigram, "Qz",
which makes the Quotez (but not the normalization)
reversible in the usual cases,
and also cannot introduce a leading underscore,
which can have special meaning in Python.
__ https://www.python.org/dev/peps/pep-0008/#naming-conventions
Characters can be encoded in one of three ways:
Short names, Unicode names, and ordinals.
The `demunge` function will accept any of these encodings,
while the `munge` function will prioritize short names,
then fall back to Unicode names, then fall back to ordinals.
Short names are given in the `TO_NAME` table in this module.
Any spaces in the Unicode names are replaced with an ``x`` and
any hyphens are replaced with an ``h``.
(Unicode names are in all caps and these substitutions are lower-case.)
Ordinals are given in base 10.
"""
import re
import unicodedata
from contextlib import suppress
from typing import Dict, Hashable, Mapping, Match, TypeVar
def munge(s: str) -> str:
    """
    Munge a Lissp symbol into a valid Python identifier.

    Symbols containing special characters are rewritten into
    human-readable (if unpythonic) identifiers
    by NFKC normalization followed by *Quotez* encoding.

    A symbol that starts with ``:`` is taken to be a control word
    and is handed back untouched.
    Full stops get separate treatment, since Hissp gives them meaning.
    """
    is_control_word = s.startswith(":")
    return s if is_control_word else force_munge(s)
def force_munge(s: str) -> str:
    """Munge ``s`` the way `munge` does, minus the control word check.

    Used for reader tags.
    """
    # Identifiers are always normalized first, as this example shows:
    # >>> 𝐀 = 'MATHEMATICAL BOLD CAPITAL A'
    # >>> 'A' in globals()
    # True
    normalized = unicodedata.normalize("NFKC", s)
    if not normalized.isidentifier():
        # Full stops separate the parts; each part is munged on its own.
        return ".".join(map(_munge_part, normalized.split(".")))
    return normalized  # Already an identifier, so nothing to munge.
def _munge_part(part):
    """Munge one full-stop-delimited part of a symbol.

    An empty part (from a leading, trailing, or doubled full stop)
    is returned as-is.
    """
    if not part:
        return part
    encoded = "".join(qz_encode(char) for char in part)
    if not encoded.isidentifier():
        # Every character is valid *inside* an identifier by now, so the
        # first one must be the problem (e.g. a digit): encode it regardless.
        head, tail = encoded[0], encoded[1:]
        encoded = force_qz_encode(head) + tail
        assert encoded.isidentifier(), f"{encoded!r} is not identifier"
    return encoded
QUOTEZ = "Qz{}_"
"""Format string for creating Quotez."""

FIND_QUOTEZ = re.compile(QUOTEZ.format("([0-9A-Z][0-9A-Zhx]*?)?"))
"""Regex pattern to find Quotez. Used by `demunge`."""

TO_NAME = {
    char: QUOTEZ.format(short_name)
    for char, short_name in (
        # ASCII control characters don't munge to names.
        ("!", "BANG"),
        ('"', "QUOT"),
        ("#", "HASH"),
        ("$", "DOLR"),
        ("%", "PCENT"),
        ("&", "ET"),
        ("'", "APOS"),
        ("(", "LPAR"),
        (")", "RPAR"),
        ("*", "STAR"),
        ("+", "PLUS"),
        # COMMA is fine.
        ("-", ""),  # Hyphen-minus
        # Full stop reserved for imports and attributes.
        ("/", "SOL"),
        # Digits only munge if first character.
        # COLON is fine.
        (";", "SEMI"),
        ("<", "LT"),  # Less Than or LefT.
        ("=", "EQ"),
        (">", "GT"),  # Greater Than or riGhT.
        ("?", "QUERY"),
        ("@", "AT"),
        # Capital letters are always valid in Python identifiers.
        ("[", "LSQB"),
        ("\\", "BSOL"),
        ("]", "RSQB"),
        ("^", "HAT"),
        # Underscore is valid in Python identifiers.
        ("`", "GRAVE"),
        # Small letters are also always valid.
        ("{", "LCUB"),
        ("|", "VERT"),
        ("}", "RCUB"),
        # TILDE is fine.
    )
}
"""Shorter names for Quotez."""

# Translation table (ordinal -> ordinal) for `str.translate`:
# a space becomes ``x`` and a hyphen becomes ``h``.
QZ_NAME = str.maketrans(" -", "xh")
def qz_encode(c: str) -> str:
    """
    Return the Quotez encoding of a character,
    or the character itself if a Python identifier can already hold it.
    """
    # Prefixing a known-good start character tests whether ``c`` may
    # appear *anywhere* in an identifier, not just in first position.
    fits_in_identifier = ("x" + c).isidentifier()
    return c if fits_in_identifier else force_qz_encode(c)
def force_qz_encode(c: str) -> str:
    """
    Return the Quotez encoding of a character unconditionally,
    whether or not it would be valid in a Python identifier.

    The short name from `TO_NAME` is preferred. Failing that, the
    character's Unicode name is used, with its spaces and hyphens swapped
    out via `QZ_NAME`. A character with neither is encoded as its ordinal.
    """
    try:
        return TO_NAME[c]
    except LookupError:
        pass  # No short name; try the Unicode name next.
    try:
        unicode_name = unicodedata.name(c)
        return QUOTEZ.format(unicode_name.translate(QZ_NAME))
    except ValueError:
        pass  # Unnamed character; fall back to its ordinal.
    return QUOTEZ.format(ord(c))
K = TypeVar("K", bound=Hashable)
V = TypeVar("V")


def _inverse_1to1(mapping: Mapping[K, V]) -> Dict[V, K]:
    """Return a new dict mapping each value of ``mapping`` back to its key.

    Asserts that no two keys shared a value, i.e. nothing was lost
    in the inversion.
    """
    inverted = dict(zip(mapping.values(), mapping.keys()))
    assert len(mapping) == len(inverted)
    return inverted
# Maps each short-name Quotez (e.g. ``QzSTAR_``) back to its character.
LOOKUP_NAME = _inverse_1to1(TO_NAME)
"""The inverse of `TO_NAME`."""
# The inverse of `QZ_NAME`: a `str.translate` table that turns ``x`` and
# ``h`` back into the space and hyphen of a Unicode name.
UN_QZ_NAME = _inverse_1to1(QZ_NAME)
def _qz_decode(match: Match[str]) -> str:
    """Decode a single Quotez regex match into the text it stands for.

    The encodings are tried in order: a short name from `LOOKUP_NAME`,
    a Unicode name (with ``x`` and ``h`` mapped back via `UN_QZ_NAME`),
    and finally a base 10 ordinal.

    :param match: A match whose whole text is the Quotez and whose
        group 1 is the name or ordinal between ``Qz`` and ``_``.
    :return: The decoded text, or the matched text unchanged if it is
        not a valid Quotez.
    """
    with suppress(KeyError):
        return LOOKUP_NAME[match.group()]
    with suppress(KeyError):
        return unicodedata.lookup(match.group(1).translate(UN_QZ_NAME))
    # int() raises ValueError for a non-number. chr() raises ValueError for
    # an ordinal past the Unicode range, but CPython raises OverflowError
    # for one too big for a C int. Either way it is an invalid Quotez
    # and must be left as-is, not crash the caller.
    with suppress(ValueError, OverflowError):
        return chr(int(match.group(1)))
    return match.group()
def demunge(s: str) -> str:
    """Reverse `munge` by decoding every Quotez in ``s`` to its character.

    A character may have been encoded as a short name, a Unicode name,
    or an ordinal. All three are decoded here, even though `munge`
    always settles on just one of them for a given character.

    Text outside of Quotez, and any invalid Quotez,
    comes through unchanged.

    >>> demunge("QzFOO_QzGT_QzHYPHENhMINUS_Qz62_bar")
    'QzFOO_>->bar'
    """
    decoded = FIND_QUOTEZ.sub(_qz_decode, s)
    return decoded