Source code for bibble.people.name_reader

  1#!/usr/bin/env python3
  2"""
  3
  4See EOF for license/metadata/notes as applicable
  5"""
  6
  7# Imports:
  8from __future__ import annotations
  9
 10# ##-- stdlib imports
 11import datetime
 12import enum
 13import functools as ftz
 14import itertools as itz
 15import logging as logmod
 16import pathlib as pl
 17import re
 18import time
 19import types
 20import weakref
 21from uuid import UUID, uuid1
 22
 23# ##-- end stdlib imports
 24
 25# ##-- 3rd party imports
 26import bibtexparser
 27import bibtexparser.model as model
 28from bibtexparser import middlewares as ms
 29from bibtexparser.middlewares.middleware import (BlockMiddleware,
 30                                                 LibraryMiddleware)
 31from jgdv import Mixin, Proto
 32
 33# ##-- end 3rd party imports
 34
 35# ##-- 1st party imports
 36import bibble._interface as API
 37from bibble.util.mixins import FieldMatcher_m
 38from bibble.util.middlecore import IdenBlockMiddleware
 39from bibble.util.name_parts import NameParts_d
 40
 41# ##-- end 1st party imports
 42
 43import pyparsing as pp
 44from . import _interface as API_N
 45
 46# ##-- types
 47# isort: off
 48import abc
 49import collections.abc
 50from typing import TYPE_CHECKING, cast, assert_type, assert_never
 51from typing import Generic, NewType
 52# Protocols:
 53from typing import Protocol, runtime_checkable
 54# Typing Decorators:
 55from typing import no_type_check, final, override, overload
 56
 57if TYPE_CHECKING:
 58    from jgdv import Maybe, Result
 59    from typing import Final
 60    from typing import ClassVar, Any, LiteralString
 61    from typing import Never, Self, Literal
 62    from typing import TypeGuard
 63    from collections.abc import Iterable, Iterator, Callable, Generator
 64    from collections.abc import Sequence, Mapping, MutableMapping, Hashable
 65
 66    type Block = model.Block
 67    type Field = model.Field
 68    type Entry = model.Entry
 69    from bibtexparser.library import Library
 70    type Parser = pp.core.ParserElement
 71##--|
 72
 73# isort: on
 74# ##-- end types
 75
##-- logging
# Module-level logger, named after this module.
# NB: the name `logging` here is this logger object; the stdlib module is imported as `logmod`.
logging = logmod.getLogger(__name__)
##-- end logging

# Short alias for the state enum used by the author-splitting state machine.
PARSE_STATE = API_N.NameSplitState_e
##--|
 82
class _SplitAuthors_m:
    """ Mixin for splitting a bibtex name field into its individual names.

    Adapted from bibtexparser's split_multiple_persons_names,
    originally by Blair Bonnett.

    Names are separated on intermediary 'and's.
    Like the original, non-breaking spaces and '~'s are treated as
    regular characters, not whitespace.

    An 'and' inside braces is returned unmodified.
    eg: '{Simon and Schuster}' -> ['{Simon and Schuster}']

    """

    def _build_split_parser(self) -> Parser:
        """ Build the parser element for the (unfinished) pyparsing splitter. """
        return pp.Literal("and")

    def _split_authors(self, val:str, *, strict=True) -> list[str]:
        """ Split `val` into names. Delegates to the state machine implementation. """
        return self._split_authors_fsm(val, strict=strict)

    def _split_authors_pp(self, val:str, *, strict=True) -> list[str]:
        """ TODO: pyparsing based splitter. Currently always returns an empty list. """
        if not val.strip():
            return []

        # The parser is built but not used yet.
        self._build_split_parser()
        return []

    def _split_authors_fsm(self, val:str, *, strict=True) -> list[str]:
        """ Split `val` on top-level ' and ' separators using a small state machine.

        `strict` is accepted for signature parity with the other splitters,
        and is not consulted here.
        Returns the list of name substrings, in order of appearance.
        """
        text = val.strip()
        if not text:
            return []

        states   = API_N.NameSplitState_e
        blanks   = API_N.NAME_WHITESPACE  # Characters counted as whitespace.
        state    = PARSE_STATE.start_ws   # Current machine state.
        consumed = 0                      # Number of characters read so far.
        depth    = 0                      # Current brace nesting depth.
        word_end = 0                      # Candidate end index of the current name.
        bounds   = [[0]]                  # [start, end] index pairs, one per name.

        chars = iter(text)
        for ch in chars:
            consumed += 1

            # An escape swallows the following character (if there is one).
            if ch == "\\":
                next(chars, None)
                consumed += 1
                continue

            # Opening brace: may itself be the first character of the next name.
            if ch == API_N.OBRACE:
                if state == states.next_word:
                    bounds[-1].append(word_end)
                    bounds.append([consumed - 1])
                depth += 1
                state = states.start_ws
                continue

            # Closing brace: never drop below depth zero.
            if ch == API_N.CBRACE:
                if depth:
                    depth -= 1
                state = states.start_ws
                continue

            # Anything inside braces is opaque to the splitter.
            if depth:
                state = states.start_ws
                continue

            is_blank = ch in blanks
            if state == states.start_ws:
                # Waiting for the whitespace that opens a ' and '.
                if is_blank:
                    state = states.find_a
                    word_end = consumed - 1
            elif state == states.find_a:
                # Repeated whitespace keeps us here; anything but 'a' resets.
                if ch in ("a", "A"):
                    state = states.find_n
                elif not is_blank:
                    state = states.start_ws
            elif state == states.find_n:
                if ch in ("n", "N"):
                    state = states.find_d
                elif is_blank:
                    state = states.find_a
                    word_end = consumed - 1
                else:
                    state = states.start_ws
            elif state == states.find_d:
                if ch in ("d", "D"):
                    state = states.end_ws
                elif is_blank:
                    state = states.find_a
                    word_end = consumed - 1
                else:
                    state = states.start_ws
            elif state == states.end_ws:
                # The whitespace that closes the ' and '.
                state = states.next_word if is_blank else states.start_ws
            elif state == states.next_word:
                # Skip repeated whitespace, then close the previous name
                # and open the next one at this character.
                if not is_blank:
                    bounds[-1].append(word_end)
                    bounds.append([consumed - 1])
                    state = states.start_ws

        # The final name runs to the end of the string.
        bounds[-1].append(None)
        return [text[lo:hi] for lo, hi in bounds]
209
[docs] 210class _NameToParts_m: 211 """ Adapted from bibtexparser's parse_single_name_into_parts, originally by Blair Bonnett. 212 213 Parses an individual name into a NameParts_d, a simple data structure containing: 214 - first : list. First names. 215 - von : list. 216 - last : list. Last Names. 217 - jr : list. 218 219 Bibtex Names are of one of the forms: 220 - first von last 221 - von last, first 222 - von last, jr, first 223 """ 224
[docs] 225 def _build_parts_parser(self) -> Parser: 226 return pp.Literal("and")
227
[docs] 228 def _name_to_parts(self, val:str, *, strict=True) -> NameParts_d: 229 val = val.strip() 230 if not bool(val): 231 return NameParts_d() 232 233 sections, cases = self._parse_name_fsm(val, strict=strict) 234 # No non-whitespace input. 235 if not sections or not any(bool(section) for section in sections): 236 return NameParts_d() 237 238 match sections: 239 case [x]: 240 parts = self._first_von_last(x, cases) 241 case [*xs]: 242 parts = self._von_last_first(xs, cases) 243 244 return parts
245
[docs] 246 def _parse_name_pp(self, val, *, strict=True) -> tuple[list, list]: 247 """ TODO """ 248 return [], []
249
[docs] 250 def _parse_name_fsm(self, val, *, strict=True) -> tuple[list, list]: 251 # We'll iterate over the input once, dividing it into a list of words for 252 # each comma-separated section. We'll also calculate the case of each word 253 # as we work. 254 sections = [[]] # Sections of the name. 255 cases = [[]] # 1 = uppercase, 0 = lowercase, -1 = caseless. 256 word = [] # Current word. 257 case = -1 # Case of the current word. 258 level = 0 # Current brace level. 259 bracestart = False # Will the next character be the first within a brace? 260 controlseq = True # Are we currently processing a control sequence? 261 specialchar = None # Are we currently processing a special character? 262 whitespace = API_N.NAME_WHITESPACE 263 264 # Using an iterator allows us to deal with escapes in a simple manner. 265 nameiter = iter(val) 266 for char in nameiter: 267 # An escape. 268 match char: 269 case "\\": 270 try: 271 escaped = next(nameiter) 272 # BibTeX doesn't allow whitespace escaping. Copy the slash and fall 273 # through to the normal case to handle the whitespace. 274 if escaped in whitespace: 275 word.append(char) 276 char = escaped 277 else: 278 if bracestart: 279 # Is this the first character in a brace? 280 bracestart = False 281 controlseq = escaped.isalpha() 282 specialchar = True 283 # Can we use it to determine the case? 284 elif (case == -1) and escaped.isalpha(): 285 if escaped.isupper(): 286 case = 1 287 else: 288 case = 0 289 290 # Copy the escape to the current word and go to the next 291 # character in the input. 292 word.append(char) 293 word.append(escaped) 294 continue 295 except StopIteration: 296 # If we're at the end of the string, then the \ is just a \. 297 word.append(char) 298 case API_N.OBRACE: 299 # Start of a braced expression. 
300 level += 1 301 word.append(char) 302 bracestart = True 303 controlseq = False 304 specialchar = False 305 continue 306 case API_N .CBRACE: 307 # All the below cases imply this (and don't test its previous value). 308 bracestart = False 309 # End of a braced expression. 310 # Check and reduce the level. 311 if level: 312 level -= 1 313 else: 314 if strict: 315 raise ValueError(name=val, reason="Unmatched closing brace") 316 word.insert(0, "{") 317 318 # Update the state, append the character, and move on. 319 controlseq = False 320 specialchar = False 321 word.append(char) 322 continue 323 case _ if level: 324 # All the below cases imply this (and don't test its previous value). 325 bracestart = False 326 # Inside a braced expression. 327 # Is this the end of a control sequence? 328 if controlseq: 329 if not char.isalpha(): 330 controlseq = False 331 # If it's a special character, can we use it for a case? 332 elif specialchar: 333 if (case == -1) and char.isalpha(): 334 if char.isupper(): 335 case = 1 336 else: 337 case = 0 338 339 # Append the character and move on. 340 word.append(char) 341 continue 342 343 # End of a word. 344 # NB. we know we're not in a brace here due to the previous case. 345 case x if x == "," or x in whitespace: 346 # All the below cases imply this (and don't test its previous value). 347 bracestart = False 348 # Don't add empty words due to repeated whitespace. 349 if word: 350 sections[-1].append("".join(word)) 351 word = [] 352 cases[-1].append(case) 353 case = -1 354 controlseq = False 355 specialchar = False 356 357 # End of a section. 358 if char == ",": 359 if len(sections) < 3: 360 sections.append([]) 361 cases.append([]) 362 elif strict: 363 raise ValueError(name=val, reason="Too many commas") 364 continue 365 ##--| 366 # Regular character. 367 word.append(char) 368 if (case == -1) and char.isalpha(): 369 if char.isupper(): 370 case = 1 371 else: 372 case = 0 373 else: 374 pass 375 ##--| 376 # Unterminated brace? 
377 if level: 378 if strict: 379 raise ValueError(name=val, reason="Unterminated opening brace") 380 while level: 381 word.append(API_N.CBRACE) 382 level -= 1 383 384 # Handle the final word. 385 if word: 386 sections[-1].append("".join(word)) 387 cases[-1].append(case) 388 389 # Get rid of trailing sections. 390 if not sections[-1]: 391 # Trailing comma? 392 if (len(sections) > 1) and strict: 393 raise ValueError(name=val, reason="Trailing comma at end of name") 394 sections.pop(-1) 395 cases.pop(-1) 396 397 return sections, cases
398
[docs] 399 def _first_von_last(self, p0, cases) -> NameParts_d: 400 # Form 2: "First von Last" 401 parts = NameParts_d() 402 403 # One word only: last cannot be empty. 404 if len(p0) == 1: 405 parts.last = p0 406 407 # Two words: must be first and last. 408 elif len(p0) == 2: 409 parts.first = p0[:1] 410 parts.last = p0[1:] 411 412 # Need to use the cases to figure it out. 413 else: 414 cases = cases[0] 415 416 # - First is the longest sequence of words starting with uppercase 417 # that is not the whole string. 418 # - von is then the longest sequence # whose last word starts with 419 # lowercase that is not the whole # string. 420 # - Last is the rest. 421 # NB., this means last cannot be empty. 422 423 # At least one lowercase letter. 424 if 0 in cases: 425 # Index from end of list of first and last lowercase word. 426 firstl = cases.index(0) - len(cases) 427 lastl = -cases[::-1].index(0) - 1 428 if lastl == -1: 429 lastl -= 1 # Cannot consume the rest of the string. 430 431 # Pull the parts out. 432 parts.first = p0[:firstl] 433 parts.von = p0[firstl : lastl + 1] 434 parts.last = p0[lastl + 1 :] 435 436 # No lowercase: last is the last word, first is everything else. 437 else: 438 parts.first = p0[:-1] 439 parts.last = p0[-1:] 440 ##--| 441 return parts
442
[docs] 443 def _von_last_first(self, sections, cases) -> NameParts_d: 444 # Form 2 ("von Last, First") or 3 ("von Last, jr, First") 445 # As long as there is content in the first name partition, use it as-is. 446 parts = NameParts_d() 447 first = sections[-1] 448 if first and first[0]: 449 parts.first = first 450 451 # And again with the jr part. 452 if len(sections) == 3: 453 jr = sections[-2] 454 if jr and jr[0]: 455 parts.jr = jr 456 457 # Last name cannot be empty; if there is only one word in the first 458 # partition, we have to use it for the last name. 459 last = sections[0] 460 if len(last) == 1: 461 parts.last = last 462 return parts 463 464 # Have to look at the cases to figure it out. 465 lcases = cases[0] 466 467 def rindex(k, x, default): 468 """Returns the index of the rightmost occurrence of x in k.""" 469 for i in range(len(k) - 1, -1, -1): 470 if k[i] == x: 471 return i 472 return default 473 474 # Check if at least one of the words is lowercase 475 if 0 in lcases: 476 # Excluding the last word, find the index of the last lower word 477 split = rindex(lcases[:-1], 0, -1) + 1 478 parts.von = sections[0][:split] 479 parts.last = sections[0][split:] 480 481 # All uppercase => all last. 482 else: 483 parts.last = sections[0] 484 485 ##--| 486 return parts
487 488##--| 489
[docs] 490@Proto(API.ReadTime_p) 491@Mixin(FieldMatcher_m, _SplitAuthors_m, _NameToParts_m) 492class NameReader(IdenBlockMiddleware): 493 """ A Refactored version of bibtexparser's SplitNameParts and SeparateCoAuthors 494 """ 495 _whitelist = ("author", "editor", "translator") 496 497 def __init__(self, *, parts:bool=True, authors:bool=True, **kwargs): 498 super().__init__(**kwargs) 499 self._do_split_authors = authors 500 self._do_name_parts = parts 501 self.set_field_matchers(white=self._whitelist, black=[]) 502 if self._do_name_parts and not self._do_split_authors: 503 raise ValueError("Can't generate name parts if you don't split authors") 504
[docs] 505 def on_read(self): 506 Never()
507
[docs] 508 def transform_Entry(self, entry:Entry, library:Library) -> list[Entry]: 509 match self.match_on_fields(entry, library): 510 case model.Entry() as x: 511 return [x] 512 case Exception() as err: 513 return [self.make_error_block(entry, err)] 514 case x: 515 raise TypeError(type(x))
516
[docs] 517 def field_h(self, field:Field, entry:Entry) -> Result[list[Field], Exception]: 518 result = [] 519 match self._do_split_authors: 520 case True: 521 authors = self._split_authors(field.value) 522 case False: 523 authors = field.value 524 case x: 525 raise TypeError(type(x)) 526 527 match authors: 528 case str(): 529 pass 530 case [*xs] if self._do_name_parts: 531 parts = [self._name_to_parts(x) for x in xs] 532 result.append(model.Field(field.key, parts)) 533 case [*xs]: 534 result.append(model.Field(field.key, list(xs))) 535 case x: 536 raise TypeError(type(x)) 537 538 return result