Source code for bibble.people.name_reader

  1#!/usr/bin/env python3
  2"""
  3
  4See EOF for license/metadata/notes as applicable
  5"""
  6
  7# Imports:
  8from __future__ import annotations
  9
 10# ##-- stdlib imports
 11import datetime
 12import enum
 13import functools as ftz
 14import itertools as itz
 15import logging as logmod
 16import pathlib as pl
 17import re
 18import time
 19import types
 20import weakref
 21from uuid import UUID, uuid1
 22
 23# ##-- end stdlib imports
 24
 25# ##-- 3rd party imports
 26import bibtexparser
 27import bibtexparser.model as model
 28from bibtexparser import middlewares as ms
 29from bibtexparser.middlewares.middleware import (BlockMiddleware,
 30                                                 LibraryMiddleware)
 31from jgdv import Mixin, Proto
 32
 33# ##-- end 3rd party imports
 34
 35# ##-- 1st party imports
 36import bibble._interface as API
 37from bibble.util.mixins import FieldMatcher_m
 38from bibble.util.middlecore import IdenBlockMiddleware
 39from bibble.util.name_parts import NameParts_d
 40
 41# ##-- end 1st party imports
 42
 43import pyparsing as pp
 44from . import _interface as API_N
 45
 46# ##-- types
 47# isort: off
 48import abc
 49import collections.abc
 50from typing import TYPE_CHECKING, cast, assert_type, assert_never
 51from typing import Generic, NewType
 52# Protocols:
 53from typing import Protocol, runtime_checkable
 54# Typing Decorators:
 55from typing import no_type_check, final, override, overload
 56
 57if TYPE_CHECKING:
 58    from jgdv import Maybe, Result
 59    from typing import Final
 60    from typing import ClassVar, Any, LiteralString
 61    from typing import Never, Self, Literal
 62    from typing import TypeGuard
 63    from collections.abc import Iterable, Iterator, Callable, Generator
 64    from collections.abc import Sequence, Mapping, MutableMapping, Hashable
 65
 66    type Block = model.Block
 67    type Field = model.Field
 68    type Entry = model.Entry
 69    from bibtexparser.library import Library
 70    type Parser = pp.core.ParserElement
 71##--|
 72
 73# isort: on
 74# ##-- end types
 75
##-- logging
# Module-level logger named after this module. Note that it is bound to the
# name `logging`; the stdlib logging module itself is imported as `logmod`.
logging = logmod.getLogger(__name__)
##-- end logging

# Short alias for the author-splitting state constants declared in the
# sibling `_interface` module (imported above as API_N).
PARSE_STATE = API_N.NameSplitState_e
##--|
 82

[docs]
 83class _SplitAuthors_m:
 84    """ Adapated from bibtexparser's split_multiple_persons_names, originally by Blair Bonnett
 85
 86    Splits names by intermediary 'and's.
 87    Like its original, treats non-breaking space and '~'s as regular chars not whitespace.
 88
 89    'and's within braces are returned un modified.
 90    eg: '{Simon and Schuster}' -> ['{Simon and Schuster}']
 91
 92    """
 93

[docs]
 94    def _build_split_parser(self) -> Parser:
 95        return pp.Literal("and")

 96

[docs]
 97    def _split_authors(self, val:str, *, strict=True) -> list[str]:
 98        return self._split_authors_fsm(val, strict=strict)

 99

[docs]
100    def _split_authors_pp(self, val:str, *, strict=True) -> list[str]:
101        """
102        TODO
103        """
104        val = val.strip()
105        if not bool(val):
106            return []
107
108        parser = self._build_split_parser()
109
110        return []

111

[docs]
112    def _split_authors_fsm(self, val:str, *, strict=True) -> list[str]:
113        val = val.strip()
114        if not bool(val):
115            return []
116
117        # Processing variables.
118        step         = PARSE_STATE.start_ws  # Current step.
119        pos          = 0  # Current position in string.
120        bracelevel   = 0  # Current bracelevel.
121        spans        = [[0]]  # Spans of names within the string.
122        possible_end = 0  # Possible end position of a name.
123        whitespace   = API_N.NAME_WHITESPACE # Allowed whitespace characters.
124
125        # Loop over the string.
126        namesiter = iter(val)
127        for char in namesiter:
128            pos += 1
129            match char:
130                case "\\":
131                    try:
132                        next(namesiter)
133                    except StopIteration:
134                        # If we're at the end of the string, then the \ is just a \.
135                        pass
136                    pos += 1
137                    continue
138                case API_N.OBRACE:
139                    # Change in brace level.
140                    if step == API_N.NameSplitState_e.next_word:
141                        spans[-1].append(possible_end)
142                        spans.append([pos - 1])
143
144                    bracelevel += 1
145                    step = API_N.NameSplitState_e.start_ws
146                    continue
147                case API_N.CBRACE:
148                    if bracelevel:
149                        bracelevel -= 1
150
151                    step = API_N.NameSplitState_e.start_ws
152                    continue
153                case _:
154                    pass
155
156            # Ignore everything inside a brace.
157            if bracelevel:
158                step = API_N.NameSplitState_e.start_ws
159                continue
160
161            match step:
162                case API_N.NameSplitState_e.start_ws if char in whitespace:
163                    # Looking for a whitespace character to start the ' and '. When we find
164                    # one, mark it as the possible end of the previous word.
165                    step = API_N.NameSplitState_e.find_a
166                    possible_end = pos - 1
167                case API_N.NameSplitState_e.find_a if char in ("a", "A"):
168                    # Looking for the letter "a".
169                    step = API_N.NameSplitState_e.find_n
170                case API_N.NameSplitState_e.find_a if char not in whitespace:
171                    # NB, we can have multiple whitespace characters so we need to handle that here.
172                    step = API_N.NameSplitState_e.start_ws
173                case API_N.NameSplitState_e.find_n if char in ("n", "N"):
174                    # Looking for the letter n.
175                    step = API_N.NameSplitState_e.find_d
176                case API_N.NameSplitState_e.find_n if char in whitespace:
177                    step = API_N.NameSplitState_e.find_a
178                    possible_end = pos - 1
179                case API_N.NameSplitState_e.find_n:
180                    step = API_N.NameSplitState_e.start_ws
181                case API_N.NameSplitState_e.find_d if char in ("d", "D"):
182                    # Looking for the letter d.
183                    step = API_N.NameSplitState_e.end_ws
184                case API_N.NameSplitState_e.find_d if char in whitespace:
185                    step = API_N.NameSplitState_e.find_a
186                    possible_end = pos - 1
187                case API_N.NameSplitState_e.find_d:
188                    step = API_N.NameSplitState_e.start_ws
189                case API_N.NameSplitState_e.end_ws if char in whitespace:
190                    # And now the whitespace to end the ' and '.
191                    step = API_N.NameSplitState_e.next_word
192                case API_N.NameSplitState_e.end_ws:
193                    step = API_N.NameSplitState_e.start_ws
194                case API_N.NameSplitState_e.next_word if char not in whitespace:
195                    # Again, we need to handle multiple whitespace characters. Keep going
196                    # until we find the start of the next word.
197                    # Finish the previous word span, start the next,
198                    # and do it all again.
199                    spans[-1].append(possible_end)
200                    spans.append([pos - 1])
201                    step = API_N.NameSplitState_e.start_ws
202
203        # Finish the last word.
204
205        spans[-1].append(None)
206
207        # Extract and return the names.
208        return [val[start:end] for start, end in spans]


209

[docs]
210class _NameToParts_m:
211    """ Adapted from bibtexparser's parse_single_name_into_parts, originally by Blair Bonnett.
212
213    Parses an individual name into a NameParts_d, a simple data structure containing:
214    - first : list. First names.
215    - von   : list.
216    - last  : list. Last Names.
217    - jr    : list.
218
219    Bibtex Names are of one of the forms:
220    - first von last
221    - von last, first
222    - von last, jr, first
223    """
224

[docs]
225    def _build_parts_parser(self) -> Parser:
226        return pp.Literal("and")

227

[docs]
228    def _name_to_parts(self, val:str, *, strict=True) -> NameParts_d:
229        val = val.strip()
230        if not bool(val):
231            return NameParts_d()
232
233        sections, cases = self._parse_name_fsm(val, strict=strict)
234        # No non-whitespace input.
235        if not sections or not any(bool(section) for section in sections):
236            return NameParts_d()
237
238        match sections:
239            case [x]:
240                parts = self._first_von_last(x, cases)
241            case [*xs]:
242                parts = self._von_last_first(xs, cases)
243
244        return parts

245

[docs]
246    def _parse_name_pp(self, val, *, strict=True) -> tuple[list, list]:
247        """ TODO """
248        return [], []

249

[docs]
250    def _parse_name_fsm(self, val, *, strict=True) -> tuple[list, list]:
251        # We'll iterate over the input once, dividing it into a list of words for
252        # each comma-separated section. We'll also calculate the case of each word
253        # as we work.
254        sections    = [[]]   # Sections of the name.
255        cases       = [[]]   # 1 = uppercase, 0 = lowercase, -1 = caseless.
256        word        = []     # Current word.
257        case        = -1     # Case of the current word.
258        level       = 0      # Current brace level.
259        bracestart  = False  # Will the next character be the first within a brace?
260        controlseq  = True   # Are we currently processing a control sequence?
261        specialchar = None   # Are we currently processing a special character?
262        whitespace  = API_N.NAME_WHITESPACE
263
264        # Using an iterator allows us to deal with escapes in a simple manner.
265        nameiter = iter(val)
266        for char in nameiter:
267            # An escape.
268            match char:
269                case "\\":
270                    try:
271                        escaped = next(nameiter)
272                        # BibTeX doesn't allow whitespace escaping. Copy the slash and fall
273                        # through to the normal case to handle the whitespace.
274                        if escaped in whitespace:
275                            word.append(char)
276                            char = escaped
277                        else:
278                            if bracestart:
279                                # Is this the first character in a brace?
280                                bracestart = False
281                                controlseq = escaped.isalpha()
282                                specialchar = True
283                                # Can we use it to determine the case?
284                            elif (case == -1) and escaped.isalpha():
285                                if escaped.isupper():
286                                    case = 1
287                                else:
288                                    case = 0
289
290                            # Copy the escape to the current word and go to the next
291                            # character in the input.
292                            word.append(char)
293                            word.append(escaped)
294                            continue
295                    except StopIteration:
296                        # If we're at the end of the string, then the \ is just a \.
297                        word.append(char)
298                case API_N.OBRACE:
299                    # Start of a braced expression.
300                    level += 1
301                    word.append(char)
302                    bracestart = True
303                    controlseq = False
304                    specialchar = False
305                    continue
306                case API_N .CBRACE:
307                    # All the below cases imply this (and don't test its previous value).
308                    bracestart = False
309                    # End of a braced expression.
310                    # Check and reduce the level.
311                    if level:
312                        level -= 1
313                    else:
314                        if strict:
315                            raise ValueError(name=val, reason="Unmatched closing brace")
316                        word.insert(0, "{")
317
318                    # Update the state, append the character, and move on.
319                    controlseq = False
320                    specialchar = False
321                    word.append(char)
322                    continue
323                case _ if level:
324                    # All the below cases imply this (and don't test its previous value).
325                    bracestart = False
326                    # Inside a braced expression.
327                    # Is this the end of a control sequence?
328                    if controlseq:
329                        if not char.isalpha():
330                            controlseq = False
331                    # If it's a special character, can we use it for a case?
332                    elif specialchar:
333                        if (case == -1) and char.isalpha():
334                            if char.isupper():
335                                case = 1
336                            else:
337                                case = 0
338
339                    # Append the character and move on.
340                    word.append(char)
341                    continue
342
343            # End of a word.
344            # NB. we know we're not in a brace here due to the previous case.
345                case x if x == "," or x in whitespace:
346                    # All the below cases imply this (and don't test its previous value).
347                    bracestart = False
348                    # Don't add empty words due to repeated whitespace.
349                    if word:
350                        sections[-1].append("".join(word))
351                        word = []
352                        cases[-1].append(case)
353                        case = -1
354                        controlseq = False
355                        specialchar = False
356
357                    # End of a section.
358                    if char == ",":
359                        if len(sections) < 3:
360                            sections.append([])
361                            cases.append([])
362                        elif strict:
363                            raise ValueError(name=val, reason="Too many commas")
364                    continue
365            ##--|
366            # Regular character.
367            word.append(char)
368            if (case == -1) and char.isalpha():
369                if char.isupper():
370                    case = 1
371                else:
372                    case = 0
373        else:
374            pass
375        ##--|
376        # Unterminated brace?
377        if level:
378            if strict:
379                raise ValueError(name=val, reason="Unterminated opening brace")
380            while level:
381                word.append(API_N.CBRACE)
382                level -= 1
383
384        # Handle the final word.
385        if word:
386            sections[-1].append("".join(word))
387            cases[-1].append(case)
388
389        # Get rid of trailing sections.
390        if not sections[-1]:
391            # Trailing comma?
392            if (len(sections) > 1) and strict:
393                raise ValueError(name=val, reason="Trailing comma at end of name")
394            sections.pop(-1)
395            cases.pop(-1)
396
397        return sections, cases

398

[docs]
399    def _first_von_last(self, p0, cases) -> NameParts_d:
400        # Form 2: "First von Last"
401        parts = NameParts_d()
402
403        # One word only: last cannot be empty.
404        if len(p0) == 1:
405            parts.last = p0
406
407        # Two words: must be first and last.
408        elif len(p0) == 2:
409            parts.first = p0[:1]
410            parts.last = p0[1:]
411
412        # Need to use the cases to figure it out.
413        else:
414            cases = cases[0]
415
416            # - First is the longest sequence of words starting with uppercase
417            # that is not the whole string.
418            # - von is then the longest sequence # whose last word starts with
419            # lowercase that is not the whole # string.
420            # - Last is the rest.
421            # NB., this means last cannot be empty.
422
423            # At least one lowercase letter.
424            if 0 in cases:
425                # Index from end of list of first and last lowercase word.
426                firstl = cases.index(0) - len(cases)
427                lastl = -cases[::-1].index(0) - 1
428                if lastl == -1:
429                    lastl -= 1  # Cannot consume the rest of the string.
430
431                # Pull the parts out.
432                parts.first = p0[:firstl]
433                parts.von = p0[firstl : lastl + 1]
434                parts.last = p0[lastl + 1 :]
435
436            # No lowercase: last is the last word, first is everything else.
437            else:
438                parts.first = p0[:-1]
439                parts.last = p0[-1:]
440        ##--|
441        return parts

442

[docs]
443    def _von_last_first(self, sections, cases) -> NameParts_d:
444        # Form 2 ("von Last, First") or 3 ("von Last, jr, First")
445        # As long as there is content in the first name partition, use it as-is.
446        parts = NameParts_d()
447        first = sections[-1]
448        if first and first[0]:
449            parts.first = first
450
451        # And again with the jr part.
452        if len(sections) == 3:
453            jr = sections[-2]
454            if jr and jr[0]:
455                parts.jr = jr
456
457        # Last name cannot be empty; if there is only one word in the first
458        # partition, we have to use it for the last name.
459        last = sections[0]
460        if len(last) == 1:
461            parts.last = last
462            return parts
463
464        # Have to look at the cases to figure it out.
465        lcases = cases[0]
466
467        def rindex(k, x, default):
468            """Returns the index of the rightmost occurrence of x in k."""
469            for i in range(len(k) - 1, -1, -1):
470                if k[i] == x:
471                    return i
472            return default
473
474        # Check if at least one of the words is lowercase
475        if 0 in lcases:
476            # Excluding the last word, find the index of the last lower word
477            split = rindex(lcases[:-1], 0, -1) + 1
478            parts.von = sections[0][:split]
479            parts.last = sections[0][split:]
480
481        # All uppercase => all last.
482        else:
483            parts.last = sections[0]
484
485        ##--|
486        return parts


487
488##--|
489

[docs]
490@Proto(API.ReadTime_p)
491@Mixin(FieldMatcher_m, _SplitAuthors_m, _NameToParts_m)
492class NameReader(IdenBlockMiddleware):
493    """ A Refactored version of bibtexparser's SplitNameParts and SeparateCoAuthors
494    """
495    _whitelist = ("author", "editor", "translator")
496
497    def __init__(self, *, parts:bool=True, authors:bool=True,  **kwargs):
498        super().__init__(**kwargs)
499        self._do_split_authors = authors
500        self._do_name_parts = parts
501        self.set_field_matchers(white=self._whitelist, black=[])
502        if self._do_name_parts and not self._do_split_authors:
503            raise ValueError("Can't generate name parts if you don't split authors")
504

[docs]
505    def on_read(self):
506        Never()

507

[docs]
508    def transform_Entry(self, entry:Entry, library:Library) -> list[Entry]:
509        match self.match_on_fields(entry, library):
510            case model.Entry() as x:
511                return [x]
512            case Exception() as err:
513                return [self.make_error_block(entry, err)]
514            case x:
515                raise TypeError(type(x))

516

[docs]
517    def field_h(self, field:Field, entry:Entry) -> Result[list[Field], Exception]:
518        result = []
519        match self._do_split_authors:
520            case True:
521                authors = self._split_authors(field.value)
522            case False:
523                authors = field.value
524            case x:
525                raise TypeError(type(x))
526
527        match authors:
528            case str():
529                pass
530            case [*xs] if self._do_name_parts:
531                parts = [self._name_to_parts(x) for x in xs]
532                result.append(model.Field(field.key, parts))
533            case [*xs]:
534                result.append(model.Field(field.key, list(xs)))
535            case x:
536                raise TypeError(type(x))
537
538        return result