1#!/usr/bin/env python3
2"""
3
4See EOF for license/metadata/notes as applicable
5"""
6
7# Imports:
8from __future__ import annotations
9
10# ##-- stdlib imports
11import datetime
12import enum
13import functools as ftz
14import itertools as itz
15import logging as logmod
16import pathlib as pl
17import re
18import time
19import types
20import weakref
21from uuid import UUID, uuid1
22
23# ##-- end stdlib imports
24
25# ##-- 3rd party imports
26import bibtexparser
27import bibtexparser.model as model
28from bibtexparser import middlewares as ms
29from bibtexparser.middlewares.middleware import (BlockMiddleware,
30 LibraryMiddleware)
31from jgdv import Mixin, Proto
32
33# ##-- end 3rd party imports
34
35# ##-- 1st party imports
36import bibble._interface as API
37from bibble.util.mixins import FieldMatcher_m
38from bibble.util.middlecore import IdenBlockMiddleware
39from bibble.util.name_parts import NameParts_d
40
41# ##-- end 1st party imports
42
43import pyparsing as pp
44from . import _interface as API_N
45
46# ##-- types
47# isort: off
48import abc
49import collections.abc
50from typing import TYPE_CHECKING, cast, assert_type, assert_never
51from typing import Generic, NewType
52# Protocols:
53from typing import Protocol, runtime_checkable
54# Typing Decorators:
55from typing import no_type_check, final, override, overload
56
57if TYPE_CHECKING:
58 from jgdv import Maybe, Result
59 from typing import Final
60 from typing import ClassVar, Any, LiteralString
61 from typing import Never, Self, Literal
62 from typing import TypeGuard
63 from collections.abc import Iterable, Iterator, Callable, Generator
64 from collections.abc import Sequence, Mapping, MutableMapping, Hashable
65
66 type Block = model.Block
67 type Field = model.Field
68 type Entry = model.Entry
69 from bibtexparser.library import Library
70 type Parser = pp.core.ParserElement
71##--|
72
73# isort: on
74# ##-- end types
75
##-- logging
# Module-level logger, named after this module.
logging = logmod.getLogger(__name__)
##-- end logging

# Module-level alias for the name-splitting state enum declared in the local _interface module.
PARSE_STATE = API_N.NameSplitState_e
81##--|
82
[docs]
83class _SplitAuthors_m:
84 """ Adapated from bibtexparser's split_multiple_persons_names, originally by Blair Bonnett
85
86 Splits names by intermediary 'and's.
87 Like its original, treats non-breaking space and '~'s as regular chars not whitespace.
88
89 'and's within braces are returned un modified.
90 eg: '{Simon and Schuster}' -> ['{Simon and Schuster}']
91
92 """
93
[docs]
94 def _build_split_parser(self) -> Parser:
95 return pp.Literal("and")
96
[docs]
97 def _split_authors(self, val:str, *, strict=True) -> list[str]:
98 return self._split_authors_fsm(val, strict=strict)
99
[docs]
100 def _split_authors_pp(self, val:str, *, strict=True) -> list[str]:
101 """
102 TODO
103 """
104 val = val.strip()
105 if not bool(val):
106 return []
107
108 parser = self._build_split_parser()
109
110 return []
111
[docs]
112 def _split_authors_fsm(self, val:str, *, strict=True) -> list[str]:
113 val = val.strip()
114 if not bool(val):
115 return []
116
117 # Processing variables.
118 step = PARSE_STATE.start_ws # Current step.
119 pos = 0 # Current position in string.
120 bracelevel = 0 # Current bracelevel.
121 spans = [[0]] # Spans of names within the string.
122 possible_end = 0 # Possible end position of a name.
123 whitespace = API_N.NAME_WHITESPACE # Allowed whitespace characters.
124
125 # Loop over the string.
126 namesiter = iter(val)
127 for char in namesiter:
128 pos += 1
129 match char:
130 case "\\":
131 try:
132 next(namesiter)
133 except StopIteration:
134 # If we're at the end of the string, then the \ is just a \.
135 pass
136 pos += 1
137 continue
138 case API_N.OBRACE:
139 # Change in brace level.
140 if step == API_N.NameSplitState_e.next_word:
141 spans[-1].append(possible_end)
142 spans.append([pos - 1])
143
144 bracelevel += 1
145 step = API_N.NameSplitState_e.start_ws
146 continue
147 case API_N.CBRACE:
148 if bracelevel:
149 bracelevel -= 1
150
151 step = API_N.NameSplitState_e.start_ws
152 continue
153 case _:
154 pass
155
156 # Ignore everything inside a brace.
157 if bracelevel:
158 step = API_N.NameSplitState_e.start_ws
159 continue
160
161 match step:
162 case API_N.NameSplitState_e.start_ws if char in whitespace:
163 # Looking for a whitespace character to start the ' and '. When we find
164 # one, mark it as the possible end of the previous word.
165 step = API_N.NameSplitState_e.find_a
166 possible_end = pos - 1
167 case API_N.NameSplitState_e.find_a if char in ("a", "A"):
168 # Looking for the letter "a".
169 step = API_N.NameSplitState_e.find_n
170 case API_N.NameSplitState_e.find_a if char not in whitespace:
171 # NB, we can have multiple whitespace characters so we need to handle that here.
172 step = API_N.NameSplitState_e.start_ws
173 case API_N.NameSplitState_e.find_n if char in ("n", "N"):
174 # Looking for the letter n.
175 step = API_N.NameSplitState_e.find_d
176 case API_N.NameSplitState_e.find_n if char in whitespace:
177 step = API_N.NameSplitState_e.find_a
178 possible_end = pos - 1
179 case API_N.NameSplitState_e.find_n:
180 step = API_N.NameSplitState_e.start_ws
181 case API_N.NameSplitState_e.find_d if char in ("d", "D"):
182 # Looking for the letter d.
183 step = API_N.NameSplitState_e.end_ws
184 case API_N.NameSplitState_e.find_d if char in whitespace:
185 step = API_N.NameSplitState_e.find_a
186 possible_end = pos - 1
187 case API_N.NameSplitState_e.find_d:
188 step = API_N.NameSplitState_e.start_ws
189 case API_N.NameSplitState_e.end_ws if char in whitespace:
190 # And now the whitespace to end the ' and '.
191 step = API_N.NameSplitState_e.next_word
192 case API_N.NameSplitState_e.end_ws:
193 step = API_N.NameSplitState_e.start_ws
194 case API_N.NameSplitState_e.next_word if char not in whitespace:
195 # Again, we need to handle multiple whitespace characters. Keep going
196 # until we find the start of the next word.
197 # Finish the previous word span, start the next,
198 # and do it all again.
199 spans[-1].append(possible_end)
200 spans.append([pos - 1])
201 step = API_N.NameSplitState_e.start_ws
202
203 # Finish the last word.
204
205 spans[-1].append(None)
206
207 # Extract and return the names.
208 return [val[start:end] for start, end in spans]
209
[docs]
210class _NameToParts_m:
211 """ Adapted from bibtexparser's parse_single_name_into_parts, originally by Blair Bonnett.
212
213 Parses an individual name into a NameParts_d, a simple data structure containing:
214 - first : list. First names.
215 - von : list.
216 - last : list. Last Names.
217 - jr : list.
218
219 Bibtex Names are of one of the forms:
220 - first von last
221 - von last, first
222 - von last, jr, first
223 """
224
[docs]
225 def _build_parts_parser(self) -> Parser:
226 return pp.Literal("and")
227
[docs]
228 def _name_to_parts(self, val:str, *, strict=True) -> NameParts_d:
229 val = val.strip()
230 if not bool(val):
231 return NameParts_d()
232
233 sections, cases = self._parse_name_fsm(val, strict=strict)
234 # No non-whitespace input.
235 if not sections or not any(bool(section) for section in sections):
236 return NameParts_d()
237
238 match sections:
239 case [x]:
240 parts = self._first_von_last(x, cases)
241 case [*xs]:
242 parts = self._von_last_first(xs, cases)
243
244 return parts
245
[docs]
246 def _parse_name_pp(self, val, *, strict=True) -> tuple[list, list]:
247 """ TODO """
248 return [], []
249
[docs]
250 def _parse_name_fsm(self, val, *, strict=True) -> tuple[list, list]:
251 # We'll iterate over the input once, dividing it into a list of words for
252 # each comma-separated section. We'll also calculate the case of each word
253 # as we work.
254 sections = [[]] # Sections of the name.
255 cases = [[]] # 1 = uppercase, 0 = lowercase, -1 = caseless.
256 word = [] # Current word.
257 case = -1 # Case of the current word.
258 level = 0 # Current brace level.
259 bracestart = False # Will the next character be the first within a brace?
260 controlseq = True # Are we currently processing a control sequence?
261 specialchar = None # Are we currently processing a special character?
262 whitespace = API_N.NAME_WHITESPACE
263
264 # Using an iterator allows us to deal with escapes in a simple manner.
265 nameiter = iter(val)
266 for char in nameiter:
267 # An escape.
268 match char:
269 case "\\":
270 try:
271 escaped = next(nameiter)
272 # BibTeX doesn't allow whitespace escaping. Copy the slash and fall
273 # through to the normal case to handle the whitespace.
274 if escaped in whitespace:
275 word.append(char)
276 char = escaped
277 else:
278 if bracestart:
279 # Is this the first character in a brace?
280 bracestart = False
281 controlseq = escaped.isalpha()
282 specialchar = True
283 # Can we use it to determine the case?
284 elif (case == -1) and escaped.isalpha():
285 if escaped.isupper():
286 case = 1
287 else:
288 case = 0
289
290 # Copy the escape to the current word and go to the next
291 # character in the input.
292 word.append(char)
293 word.append(escaped)
294 continue
295 except StopIteration:
296 # If we're at the end of the string, then the \ is just a \.
297 word.append(char)
298 case API_N.OBRACE:
299 # Start of a braced expression.
300 level += 1
301 word.append(char)
302 bracestart = True
303 controlseq = False
304 specialchar = False
305 continue
306 case API_N .CBRACE:
307 # All the below cases imply this (and don't test its previous value).
308 bracestart = False
309 # End of a braced expression.
310 # Check and reduce the level.
311 if level:
312 level -= 1
313 else:
314 if strict:
315 raise ValueError(name=val, reason="Unmatched closing brace")
316 word.insert(0, "{")
317
318 # Update the state, append the character, and move on.
319 controlseq = False
320 specialchar = False
321 word.append(char)
322 continue
323 case _ if level:
324 # All the below cases imply this (and don't test its previous value).
325 bracestart = False
326 # Inside a braced expression.
327 # Is this the end of a control sequence?
328 if controlseq:
329 if not char.isalpha():
330 controlseq = False
331 # If it's a special character, can we use it for a case?
332 elif specialchar:
333 if (case == -1) and char.isalpha():
334 if char.isupper():
335 case = 1
336 else:
337 case = 0
338
339 # Append the character and move on.
340 word.append(char)
341 continue
342
343 # End of a word.
344 # NB. we know we're not in a brace here due to the previous case.
345 case x if x == "," or x in whitespace:
346 # All the below cases imply this (and don't test its previous value).
347 bracestart = False
348 # Don't add empty words due to repeated whitespace.
349 if word:
350 sections[-1].append("".join(word))
351 word = []
352 cases[-1].append(case)
353 case = -1
354 controlseq = False
355 specialchar = False
356
357 # End of a section.
358 if char == ",":
359 if len(sections) < 3:
360 sections.append([])
361 cases.append([])
362 elif strict:
363 raise ValueError(name=val, reason="Too many commas")
364 continue
365 ##--|
366 # Regular character.
367 word.append(char)
368 if (case == -1) and char.isalpha():
369 if char.isupper():
370 case = 1
371 else:
372 case = 0
373 else:
374 pass
375 ##--|
376 # Unterminated brace?
377 if level:
378 if strict:
379 raise ValueError(name=val, reason="Unterminated opening brace")
380 while level:
381 word.append(API_N.CBRACE)
382 level -= 1
383
384 # Handle the final word.
385 if word:
386 sections[-1].append("".join(word))
387 cases[-1].append(case)
388
389 # Get rid of trailing sections.
390 if not sections[-1]:
391 # Trailing comma?
392 if (len(sections) > 1) and strict:
393 raise ValueError(name=val, reason="Trailing comma at end of name")
394 sections.pop(-1)
395 cases.pop(-1)
396
397 return sections, cases
398
[docs]
399 def _first_von_last(self, p0, cases) -> NameParts_d:
400 # Form 2: "First von Last"
401 parts = NameParts_d()
402
403 # One word only: last cannot be empty.
404 if len(p0) == 1:
405 parts.last = p0
406
407 # Two words: must be first and last.
408 elif len(p0) == 2:
409 parts.first = p0[:1]
410 parts.last = p0[1:]
411
412 # Need to use the cases to figure it out.
413 else:
414 cases = cases[0]
415
416 # - First is the longest sequence of words starting with uppercase
417 # that is not the whole string.
418 # - von is then the longest sequence # whose last word starts with
419 # lowercase that is not the whole # string.
420 # - Last is the rest.
421 # NB., this means last cannot be empty.
422
423 # At least one lowercase letter.
424 if 0 in cases:
425 # Index from end of list of first and last lowercase word.
426 firstl = cases.index(0) - len(cases)
427 lastl = -cases[::-1].index(0) - 1
428 if lastl == -1:
429 lastl -= 1 # Cannot consume the rest of the string.
430
431 # Pull the parts out.
432 parts.first = p0[:firstl]
433 parts.von = p0[firstl : lastl + 1]
434 parts.last = p0[lastl + 1 :]
435
436 # No lowercase: last is the last word, first is everything else.
437 else:
438 parts.first = p0[:-1]
439 parts.last = p0[-1:]
440 ##--|
441 return parts
442
[docs]
443 def _von_last_first(self, sections, cases) -> NameParts_d:
444 # Form 2 ("von Last, First") or 3 ("von Last, jr, First")
445 # As long as there is content in the first name partition, use it as-is.
446 parts = NameParts_d()
447 first = sections[-1]
448 if first and first[0]:
449 parts.first = first
450
451 # And again with the jr part.
452 if len(sections) == 3:
453 jr = sections[-2]
454 if jr and jr[0]:
455 parts.jr = jr
456
457 # Last name cannot be empty; if there is only one word in the first
458 # partition, we have to use it for the last name.
459 last = sections[0]
460 if len(last) == 1:
461 parts.last = last
462 return parts
463
464 # Have to look at the cases to figure it out.
465 lcases = cases[0]
466
467 def rindex(k, x, default):
468 """Returns the index of the rightmost occurrence of x in k."""
469 for i in range(len(k) - 1, -1, -1):
470 if k[i] == x:
471 return i
472 return default
473
474 # Check if at least one of the words is lowercase
475 if 0 in lcases:
476 # Excluding the last word, find the index of the last lower word
477 split = rindex(lcases[:-1], 0, -1) + 1
478 parts.von = sections[0][:split]
479 parts.last = sections[0][split:]
480
481 # All uppercase => all last.
482 else:
483 parts.last = sections[0]
484
485 ##--|
486 return parts
487
488##--|
489
[docs]
490@Proto(API.ReadTime_p)
491@Mixin(FieldMatcher_m, _SplitAuthors_m, _NameToParts_m)
492class NameReader(IdenBlockMiddleware):
493 """ A Refactored version of bibtexparser's SplitNameParts and SeparateCoAuthors
494 """
495 _whitelist = ("author", "editor", "translator")
496
497 def __init__(self, *, parts:bool=True, authors:bool=True, **kwargs):
498 super().__init__(**kwargs)
499 self._do_split_authors = authors
500 self._do_name_parts = parts
501 self.set_field_matchers(white=self._whitelist, black=[])
502 if self._do_name_parts and not self._do_split_authors:
503 raise ValueError("Can't generate name parts if you don't split authors")
504
[docs]
505 def on_read(self):
506 Never()
507
[docs]
508 def transform_Entry(self, entry:Entry, library:Library) -> list[Entry]:
509 match self.match_on_fields(entry, library):
510 case model.Entry() as x:
511 return [x]
512 case Exception() as err:
513 return [self.make_error_block(entry, err)]
514 case x:
515 raise TypeError(type(x))
516
[docs]
517 def field_h(self, field:Field, entry:Entry) -> Result[list[Field], Exception]:
518 result = []
519 match self._do_split_authors:
520 case True:
521 authors = self._split_authors(field.value)
522 case False:
523 authors = field.value
524 case x:
525 raise TypeError(type(x))
526
527 match authors:
528 case str():
529 pass
530 case [*xs] if self._do_name_parts:
531 parts = [self._name_to_parts(x) for x in xs]
532 result.append(model.Field(field.key, parts))
533 case [*xs]:
534 result.append(model.Field(field.key, list(xs)))
535 case x:
536 raise TypeError(type(x))
537
538 return result