# Copyright 2019, 2020, 2021, 2022, 2023, 2024 Matthew Egan Odendahl
# SPDX-License-Identifier: Apache-2.0
R"""
The Lissp language reader and associated helper functions.
The reader is organized as a lexer and parser.
The parser is extensible with Lissp `tag`\ s.
The lexer is not extensible,
and doesn't do much more than pull tokens from a relatively simple regex
and track its position for error messages.
"""
import ast
import builtins
import hashlib
import re
from base64 import b32encode
from collections import namedtuple
from collections.abc import Iterable, Iterator
from contextlib import contextmanager, suppress
from functools import reduce
from importlib import import_module, resources
from itertools import chain
from keyword import iskeyword as _iskeyword
from pathlib import Path, PurePath
from pprint import pformat
from typing import (
Any,
Callable as Fn,
Literal,
NamedTuple,
NewType,
NoReturn,
TypeGuard,
cast,
)
import hissp.compiler as C
from hissp.compiler import Env
from hissp.munger import force_qz_encode, munge
GENSYM_BYTES = 5
"""
The number of bytes `gensym` hashes have.
The default 5 bytes (40 bits) should be more than sufficient space to
eliminate collisions with typical usage: dozens of gensyms in the same
scope would have a less than a one-in-a-billion chance of collision,
even assuming they all have the same suffix. (Even 3 bytes gets that
number down to around one in ten thousand.)
For unusual applications (if more than dozens of gensyms are expected in
a shared scope, or one in a billion is still too high), hash length can
be increased, up to a maximum of 32 bytes.
Even 8 bytes is enough space for a hundred thousand gensyms in the
same scope with similar collision probability, or dozens with a
one-in-quadrillion chance, which is probably lower than the risk of a
hardware failure. It's unlikely you'll ever need more than 16 bytes,
which has more space than `uuid.uuid4`.
Each hash character encodes 5 bits (`Base32 <base64.b32encode>`),
so a multiple of 5 is recommended, although 3, 8, or 13 bytes are also
fairly efficient for their size:
===== ===== ====== ===================================
bytes bits chars example
===== ===== ====== ===================================
3 24 5 ``Qzthink__G``
5 40 8 ``Qzthinking__G``
8 64 13 ``Qzinvestigation__G``
10 80 16 ``Qzincomprehensible__G``
13 104 21 ``Qzelectroencephalograph__G``
15 120 24 ``Qzmagneticresonanceimaging__G``
16 128 26 ``Qzpositronemissiontomography__G``
===== ===== ====== ===================================
"""
TOKENS = re.compile(
r"""(?x)
(?P<whitespace>[\n ]+)
|(?P<comment>(?:[ ]*;.*\n)+)
|(?P<badspace>\s) # Other whitespace not allowed.
|(?P<open> [(])
|(?P<close>[)])
|(?P<template>`)
|(?P<unquote>,@?)
|(?P<quote>')
|(?P<inject>[.][#])
|(?P<discard> _[#])
|(?P<gensym>[$][#])
|(?P<stararg>[*][*]?=)
|(?P<kwarg>(?:\\.|[^\\ \n"|();#])*
(?:\\.|\w) # Character before = must be alnum, or escaped.
=)
|(?P<tag> (?:\\.|[^\\ \n"|();#])*
(?:\\.|[^\\ \n"|();#.])
[#]+)
|(?P<unicode>
" # Open quote.
(?:[^"\\] # Any non-magic character.
|\\(?:.|\n) # Backslash only if paired, including with newline.
)* # Zero or more times.
" # Close quote.
)
|(?P<fragment>
[|] # open
(?:[^|\n] # No newlines or unpaired |.
|[|][|] # | only if paired.
)*
[|] # close
)
|(?P<continue>
" # String not closed.
|;.* # Comment may need another line.
)
|(?P<badfrag>[|]) # No multiline fragments.
|(?P<control>:(?:\\.|[^\\ \n"|();])*)
|(?P<bare> (?:\\.|[^\\ \n"|();])+)
|(?P<error>.)
"""
)
Token = NewType("Token", tuple[str, str, int])
[docs]class SoftSyntaxError(SyntaxError):
"""A syntax error that could be corrected with more lines of input.
When the REPL encounters this when attempting to evaluate a form,
it will ask for more lines, rather than aborting with an error.
"""
[docs]class Lexer(Iterator):
"""The tokenizer for the Lissp language.
Most of the actual tokenizing is done by the regex.
The Lexer adds some position tracking to that to help with error
messages.
"""
def __init__(self, code: str, file: str = "<?>") -> None:
self.code = code
self.file = file
self.it = self._it()
def __iter__(self) -> Iterator[Token]:
return self
def __next__(self) -> Token:
return next(self.it)
def _it(self) -> Iterator[Token]:
pos = 0
while pos < len(self.code):
match = TOKENS.match(self.code, pos)
assert match is not None
assert match.lastgroup
assert match.end() > pos, match.groups()
pos = match.end()
yield Token((match.lastgroup, match.group(), pos))
[docs] def position(self, pos: int) -> tuple[str, int, int, str]:
"""
Compute the ``filename``, ``lineno``, ``offset`` and ``text``
for a `SyntaxError`, using the current character offset in code.
"""
good = self.code[0:pos].split("\n")
lineno = len(good)
offset = len(good[-1])
return self.file, lineno, offset, self.code.split("\n")[lineno - 1]
_Unquote = namedtuple("_Unquote", ["target", "value"])
[docs]class Kwarg(NamedTuple):
"""Contains a read-time keyword argument for a `tag`.
Normally made with a `kwarg token`, but can be constructed directly.
"""
k: str
v: Any
[docs]class Lissp:
"""The Lissp Reader
Wraps around a Hissp compiler instance and creates a Lissp parser.
"""
def __init__(
self,
qualname: str = "__main__",
env: Env | None = None,
evaluate: bool = False,
filename: str = "<?>",
):
self._template_count = 0
self.qualname = qualname
self.compiler = C.Compiler(self.qualname, env, evaluate)
self.filename = filename
[docs] def template_count(self):
self._template_count += 1
return self._template_count
@property
def env(self) -> Env:
"""The wrapped `Compiler`'s ``env``."""
return self.compiler.env
@env.setter
def env(self, env: Env) -> None:
self.compiler.env = env
[docs] def compile(self, code: str) -> str:
"""Read Lissp code and pass it on to the Hissp compiler."""
hissp = self.reads(code)
return self.compiler.compile(hissp)
[docs] def reads(self, code: str) -> "Parser":
"""Read Hissp forms from code string."""
return self.parse(Lexer(code, self.filename))
[docs] def parse(self, tokens: Lexer) -> "Parser":
"""Read Hissp forms from a Lexer instance."""
return Parser(self, tokens)
[docs]class Parser(Iterator):
R"""
The parser for the Lissp language.
Parses Lissp tokens into Hissp syntax trees.
The `special tag`\ s are handled here. They are
.. list-table::
* - ``'``
- `quote<special>`
* - :literal:`\`` (backtick)
- `template quote` (starts a `template`)
* - ``_#``
- `discard tag`
* - ``.#``
- `inject tag` (evaluate at read time)
Plus the three built-in template helpers, which are only
valid inside a template.
.. list-table::
* - ``,``
- `unquote`
* - ``,@``
- `splice`
* - ``$#``
- `gensym tag`
And finally, the `stararg token` special tags ``*=`` and ``**=``.
Special tags are reserved by the reader and cannot be reassigned.
"""
def __init__(self, lissp: Lissp, tokens: Lexer) -> None:
"""Reset hasher, position, nesting depth, and gensym stacks."""
self.lissp = lissp
self.tokens = tokens
self.counters: list[int] = []
self.context: list[str] = []
self.depth: list[int] = []
self._pos = 0
self.blake = hashlib.blake2s(digest_size=GENSYM_BYTES)
self.blake.update(tokens.code.encode())
self.blake.update(self.lissp.env.get("__name__", "__main__").encode())
def __next__(self):
return next(self._no_comment())
def _no_comment(self) -> Iterator:
return (x for x in self._parse() if not isinstance(x, Comment))
def _parse(self) -> Iterator:
for k, v, self._pos in self.tokens:
# fmt: off
if k == "whitespace": continue
elif k == "comment": yield Comment(v)
elif k == "badspace": raise self._badspace(v)
elif k == "open": yield self._open()
elif k == "close": return self._close()
elif k == "template": yield self._template(v)
elif k == "unquote": yield self._unquote(v)
elif k == "quote": yield "quote", self._pull(v)
elif k == "inject": yield self._inject(v)
elif k == "discard": self._pull(v)
elif k == "gensym": yield self._gensym(self._pull(v))
elif k == "tag": yield self._tag(self._pull(v), v)
elif k == "unicode": yield self._unicode(v)
elif k == "fragment": yield self._fragment(v)
elif k == "continue": raise self._continue()
elif k == "badfrag": raise SyntaxError("unpaired |", self.position())
elif k == "control": yield self.escape(v)
elif k == "bare": yield self.bare(v)
elif k == "error": raise self._error(k)
else: yield Kwarg(v[:-1], self._pull(v)) # kwarg, stararg
# fmt: on
self._check_depth()
def _badspace(self, v: str) -> SyntaxError:
return SyntaxError(
f"{v!r} is not whitespace in Lissp. Indent with spaces only.",
self.position(),
)
[docs] def position(self, index: int | None = None) -> tuple[str, int, int, str]:
"""
Get the ``filename``, ``lineno``, ``offset`` and ``text``
for a `SyntaxError`, from the `Lexer` given to `parse`.
"""
return self.tokens.position(self._pos if index is None else index)
def _open(self) -> tuple:
self.depth.append(self._pos)
return (*self._no_comment(),)
def _close(self) -> None:
if not self.depth:
raise SyntaxError("too many `)`s", self.position())
self.depth.pop()
[docs] @contextmanager
def gensym_context(self):
"""Start a new gensym context for the current template."""
self.counters.append(self.lissp.template_count())
self.context.append("`")
try:
yield
finally:
self.counters.pop()
self.context.pop()
def _unquote(self, v: Literal[",@", ","]) -> _Unquote:
with self.unquote_context():
return _Unquote({",@": ":*", ",": ":?"}[v], self._pull(v, self._pos))
[docs] @contextmanager
def unquote_context(self):
"""Start a new unquote context for the current template."""
self.context.append(",")
if self.context.count(",") > self.context.count("`"):
raise SyntaxError("unquote outside of template", self.position()) from None
try:
yield
finally:
self.context.pop()
def _inject(self, v: str):
with C.macro_context(self.lissp.env):
return eval(
C.readerless(self._pull(v, self._pos), self.lissp.env), self.lissp.env
)
def _pull(self, v: str, p: int | None = None):
if p is None:
p = self._pos
depth = len(self.depth)
try:
return next(self._parse())
except StopIteration:
e = SoftSyntaxError if len(self.depth) == depth else SyntaxError
raise e(f"tag {v!r} missing argument", self.position(p)) from None
def _template(self, v: str):
with self.gensym_context():
return self._template_form(self._pull(v))
def _template_form(self, form):
"""Process form as template."""
case = type(form)
if is_lissp_unicode(form):
return "quote", form
if C.is_node(form):
return ("",":", *chain(*self._template_forms(form)), ":?", "") # fmt: skip
if C.is_str(form) and not form.startswith(":"):
return "quote", self.qualify(form)
if case is _Unquote:
if form.target == ":?":
return form.value
raise SyntaxError("splice not in tuple", self.position())
return form
def _template_forms(self, forms: Iterable) -> Iterable[tuple[str, Any]]:
invocation = True
for form in forms:
case = type(form)
if C.is_str(form) and not form.startswith(":"):
yield ":?", ("quote", self.qualify(form, invocation))
elif case is _Unquote:
yield form
elif C.is_node(form):
yield ":?", self._template_form(form)
else:
yield ":?", form
invocation = False
[docs] def qualify(self, symbol: str, invocation=False) -> str:
"""Qualify symbol based on current context."""
if not is_qualifiable(symbol):
return symbol
env = self.lissp.env
if invocation and C.MACROS in env and hasattr(env[C.MACROS], symbol):
return f"{self.lissp.qualname}..{C.MACROS}.{symbol}" # Known macro.
if symbol in dir(builtins) and symbol.split(".", 1)[0] not in env:
return f"builtins..{symbol}" # Known builtin, not shadowed (yet).
if invocation and "." not in symbol: # Could still be a recursive macro.
return f"{self.lissp.qualname}{C.MAYBE}{symbol}"
return f"{self.lissp.qualname}..{symbol}"
def _gensym(self, form: str) -> str:
"""Generate a symbol unique to the current template.
Re-munges any $'s as a gensym hash, or adds it as a prefix if
there aren't any. Gensym hashes are deterministic for
reproducible builds. Inputs are the code string being read,
the current `__name__` (defaults to "__main__" if not found)
and a count of templates read so far.
"""
blk = self.blake.copy()
blk.update((c := self._get_counter()).to_bytes(1 + c.bit_length() // 8, "big"))
prefix = f"_Qz{b32encode(blk.digest()).rstrip(b'=').lower().decode()}__"
marker = munge("$")
if marker not in form:
return f"{prefix}{form}"
# TODO: escape $'s somehow? $$? \$?
return form.replace(marker, prefix)
def _get_counter(self) -> int:
index = self.context.count("`") - self.context.count(",")
if not self.context or index < 0:
raise SyntaxError("gensym outside of template", self.position()) from None
if self.context[-1] == "`":
return self.counters[-1]
return self.counters[index]
def _tag(self, form, tag: str):
assert tag.endswith("#")
arity = re.sub(r"\\.", "", tag).count("#")
assert arity >= 1
depth_pos = len(self.depth), self._pos
args, kwargs = [], {}
for i, x in enumerate(chain([form], self._parse()), 1):
self._collect(args, kwargs, x)
if i == arity:
break
else:
self._tag_error(tag, *depth_pos)
label = self._label(arity, tag)
fn = self._fully_qualified if ".." in label else self._local
with C.macro_context(self.lissp.env):
return fn(label)(*args, **kwargs)
@classmethod
def _label(cls, arity: int, tag: str) -> str:
label = munge(cls.escape(tag[:-arity]))
return re.sub(r"(^\.)", lambda m: force_qz_encode(m[1]), label)
@classmethod
def _collect(cls, args: list, kwargs: dict, x) -> None:
if type(x) is Kwarg:
k, v = x
if k == "*":
args.extend(v)
elif k == "**":
kwargs.update(dict(v))
else:
kwargs[munge(cls.escape(k))] = v
else:
args.append(x)
def _tag_error(self, tag: str, depth: int, pos: int) -> NoReturn:
e = SoftSyntaxError if len(self.depth) == depth else SyntaxError
msg = f"reader tag {tag!r} missing argument"
raise e(msg, self.position(pos)) from None
@staticmethod
def _fully_qualified(tag: str):
module, function = tag.split("..", 1)
if re.match(rf"{C.MACROS}\.[^.]+$", function):
function += munge("#")
return cast(Fn, reduce(getattr, function.split("."), import_module(module)))
def _local(self, tag: str):
tag = tag.replace(".", force_qz_encode("."))
try:
return getattr(self.lissp.env[C.MACROS], tag + munge("#"))
except (AttributeError, KeyError):
raise SyntaxError(f"unknown tag {tag!r}", self.position())
[docs] @staticmethod
def escape(atom: str) -> str:
"""Process the backslashes in a token."""
return re.sub(
r"\\(.)", lambda m: force_qz_encode(m[1]) if m[1] in ".:" else m[1], atom
)
@staticmethod
def _unicode(v: str) -> str:
v = v.replace("\\\n", "").replace("\n", R"\n")
val = ast.literal_eval(v)
return v if (v := pformat(val)).startswith("(") else f"({v})"
def _fragment(self, v: str) -> str:
return v[1:-1].replace("||", "|")
def _continue(self) -> SoftSyntaxError:
return SoftSyntaxError("Incomplete string token.", self.position())
[docs] @staticmethod
def bare(v: str):
"""Preprocesses a `bare token`. Handles escapes and munging."""
if not v.startswith("\\"):
with suppress(ValueError, SyntaxError):
if not hasattr(x := ast.literal_eval(Parser.escape(v)), "__contains__"):
return x
return munge(Parser.escape(v))
def _error(self, k: str) -> SyntaxError:
assert k == "error", f"unknown token: {k!r}"
return SyntaxError("can't read this", self.position())
def _check_depth(self) -> None:
if self.depth:
raise SoftSyntaxError("form missing a `)`", self.position(self.depth.pop()))
[docs]def is_hissp_string(form: object) -> TypeGuard[str | tuple[Literal["quote"], str]]:
"""Determines if form would directly represent a string in Hissp.
(A `Hissp string`.)
Allows `readerless mode`-style strings: ``('quote','foo',)``
and any `string literal fragment`: ``'"foo"'``
(including the ``"('foo')"`` form produced by the Lissp reader).
Macros often produce strings in one of these forms, via `quote` or
`repr` on a string object.
"""
match form:
case ["quote", x] if C.is_node(form) and C.is_str(x):
return True
return bool(is_string_literal(form))
[docs]def is_lissp_unicode(form: object) -> TypeGuard[str]:
"""
Determines if form could have been read from a Lissp `Unicode token`.
It's not enough to check if the form has a string type.
Several token types such as a `control token`, `symbol token`, or
`fragment token`, read in as a `str atom`. Macros may need to
distinguish these cases.
"""
return C.is_str(form) and form.startswith("(") and bool(is_string_literal(form))
[docs]def is_string_literal(form: object) -> TypeGuard[str]:
"""Determines if `ast.literal_eval` on form produces a string.
(A `string literal fragment`.)
"""
with suppress(Exception):
return C.is_str(ast.literal_eval(form))
return False
[docs]def is_qualifiable(symbol: str) -> bool:
"""Determines if symbol can be qualified with a module.
Can't be ``quote``, ``__import__``, any Python reserved word, a
prefix auto-`gensym`, fully qualified, method syntax, or a `module
handle`; and must be a valid identifier or attribute identifier.
"""
return (
symbol not in {"quote", "__import__"}
and not _iskeyword(symbol)
and not re.match(r"_Qz[a-z2-7]+__", symbol)
and all(map(str.isidentifier, symbol.split(".")))
)
[docs]def transpile(package: str | None, *modules: str) -> None:
"""Transpiles the named Python modules from Lissp.
A ``.lissp`` file of the same name must be present in the module's
location. The Python modules are overwritten. Missing modules are
created. If the package is "" or ``None``, `transpile` writes non-
packaged modules to the current working directory instead.
"""
t = transpile_packaged if package else transpile_file
for m in modules:
t(f"{m}.lissp", package)
[docs]def transpile_packaged(resource: str, package: str) -> None:
"""Locates & transpiles a packaged ``.lissp`` resource file to ``.py``."""
with resources.path(package, resource) as path:
transpile_file(path, package)
[docs]def transpile_file(path: Path | str, package: str | None = None) -> None:
"""Transpiles a single ``.lissp`` file to ``.py`` in the same location.
Code in ``.lissp`` files is executed upon compilation. This is
necessary because macro definitions can alter the compilation of
subsequent top-level forms. A packaged Lissp file must know its
package at compile time to handle templates and macros correctly.
After the ``.py`` file is written, `__file__` will be set to it, if
it doesn't exist already.
"""
path = Path(path).resolve(strict=True)
qualname = f"{package or ''}{'.' if package else ''}{PurePath(path.name).stem}"
L = Lissp(qualname=qualname, evaluate=True, filename=str(path))
python = L.compile(re.sub(r"^#!.*\n", "", path.read_text("utf8")))
(py := path.with_suffix(".py")).write_text(python, "utf8")
L.env.setdefault("__file__", str(py))