Source code for ocebuild.parsers.schema

## @file
# Copyright (c) 2023, The OCE Build Authors. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
##
"""Methods for retrieving and handling Sample.plist schemas."""

from io import TextIOWrapper

from typing import List, Optional, Set, Tuple, Union

from .dict import nested_get, nested_set
from .regex import re_search
from .yaml import parse_yaml_types


################################################################################
#                               LaTeX Parsing Macros                           #
################################################################################

def _extract_command(line: str) -> Union[str, None]:
  """Extracts a LaTeX command from a line.

  Examples:
    >>> _extract_command('\\section{foo}')
    '\\\\section'
    >>> _extract_command('\\\\texttt{foo} \\label{bar}')
    '\\\\texttt'
    >>> _extract_command('% \\\\foo')
    None
  """

  if line[:1] == '\\' and (command_match := line.split('{', 1)[0])[1:]:
    # Handle match on labels, etc
    if command_match.count('\\') > 1:
      return command_match.split(maxsplit=1)[0]
    # Default to prefix match
    return command_match
  else:
    # No match
    return None

def _extract_key(command: str, string: str, sol='^') -> Union[str, None]:
  """Extracts a key name from a line.

  Args:
    command: The LaTeX command to match.
    string: The intput string to match against.
    sol: The start of line character to match against. Defaults to '^'.

  Returns:
    The matched key name or None if no match was found.

  Examples:
    >>> _extract_key('\\\\texttt', '\\\\texttt{foo}')
    'foo'
    >>> _extract_key('\\\\texttt', '\\\\texttt{foo} \\label{bar}')
    'foo'
    >>> _extract_key('\\\\texttt', '\\\\texttt{foo} \\texttt{bar}', sol='')
    'bar'
    >>> _extract_key('\\\\label', '\\\\texttt{foo}')
    None
  """
  return re_search(f'{sol}\{command}\{{(.*?)\}}', string, group=1)

def _extract_value(command: str,
                   string: str,
                   key: str,
                   eol='\\\\\\\\'
                   ) -> Union[str, None]:
  """Extracts a key value from a line.

  Args:
    command: The LaTeX command to match.
    string: The intput string to match against.
    key: The key name to match against.
    eol: The end of line character to match against. Defaults to '\\\\\\\\'.

  Returns:
    The matched key value or None if no match was found.

  Examples:
    >>> _extract_value('\\\\textbf', '\\\\textbf{Type}: foo\\\\\\\\', 'Type')
    'foo'
    >>> _extract_value('\\\\textbf', '\\\\textbf{Type}: foo \\\\texttt{bar}\\\\\\\\', 'Type')
    'foo \\\\texttt{bar}'
  """
  ln = re_search(f'\{command}\{{{key}\}}:\s?(.*){eol}$', string, group=1)
  if not ln or not eol: return ln
  return _extract_key(command='\.*?', string=ln.strip(), sol='^') or ln

def _parse_attributes(line: str, key: str) -> str:
  """Parses additional attributes from a line.

  Args:
    line: The line to parse.
    key: The key to append attributes to.

  Returns:
    The key with appended attributes.

  Example:
    >>> _parse_attributes('\\\\textbf{Type}: \\\\texttt{foo}, bar\\\\\\\\', 'Type')
    'foo, bar'
  """
  skey = key.replace('\\', '\\\\')
  if attr := re_search(f'\{{{skey}\}},?\s?(.*?)\\\\\\\\$', line, group=1):
    key += f", {attr}"
  return key

def _normalize_lines(entry: str) -> str:
  """Normalizes indentation errors."""
  lines = []
  for i, e in enumerate(entry.split('\n\n\n\n')):
    line_fmt = e \
      .replace('\n\n  ', ' ' if i else '\n') \
      .replace('\\newline ', '\n') \
      .strip()
    # Handle inconsistent indentation in the primary entry
    line_fmt = "".join(f"\n{l}" if l[:1] == '\\' and (not i or '}:' in l) \
                            else f" {l}" \
                        for i, l in enumerate(line_fmt.split('\n')))

    lines.append(line_fmt.strip())

  # Handle exceptions for specific commands
  line_fmt = "\n\n".join(lines).replace('  ', '') \
    .replace(cmd := '\\begin{itemize}', f'\n{cmd}') \
    .replace(f'\n\n{cmd}', f'\n{cmd}') \
    .replace(cmd := '\\tightlist',      f'\n  {cmd}') \
    .replace(cmd := '\\item',           f'\n  {cmd}') \
    .replace(cmd := '\\end{itemize}',   f'\n{cmd}')

  return line_fmt

################################################################################
#                              Entry Parsing Macros                            #
################################################################################

def _reset_key_entry(cursor: dict):
  """Resets the cursor's key attributes."""
  cursor['key'] = None
  cursor['type'] = None
  cursor['value'] = None
  cursor['entry'] = None

def _parse_key_entry(cursor: dict,
                     schema: dict,
                     sample: dict,
                     /,
                     tree: Optional[list] = None,
                     raw_schema: Optional[dict] = None
                     ):
  """Parses the cursor's key attributes and adds them to the schema."""

  # Allow for overriding the tree when recursing
  if not tree: tree = cursor['tree'].copy()

  # Attempt to fix values inconsistent with the sample plist
  if not nested_get(sample, tree):
    # 'Entry Properties' has an inconsistent name
    if tree == ['Misc', 'Entry']:
      tree[-1] = 'Entries'
      # These properties are also applicable to `Misc.Tools`
      _parse_key_entry(cursor, schema, sample,
                       tree=('Misc', 'Tools'),
                       raw_schema=raw_schema)
    # 'Memory Device Properties' has an inconsistent name
    if tree == ['PlatformInfo', 'Memory', 'Device']:
      tree[-1] = 'Devices'

  # Add entries to the schema
  entry_tree = _add_schema_entry(tree, cursor, schema)

  # If specified, add the raw LaTeX entry to the raw schema
  if entry_tree and raw_schema is not None:
    # Maps the entry tree to a <key>.<key>[idx].<key>... string
    tree = "".join(f'[{k}]' if isinstance(k, int) else f'.{k}'
                    for k in entry_tree)[1:]
    nested_set(raw_schema, (tree,), cursor['entry'])

def _parse_exclusion_rules(cursor: dict) -> Tuple[Set[str], Set[str]]:
  """Parses exclusion rules from an entry description."""

  # Set default allowlist based on parent key
  valid = set(cursor['tree'][-1:])
  invalid = set()

  if entry := cursor['entry']:
    # Parse description for exclusion/inclusion rules
    valid_pattern = r'\\emph\{Note\}: This option is only valid for \\texttt\{(.*?)\}'
    if valid_match := re_search(valid_pattern, entry, group=1):
      valid = set((valid_match,))
    invalid_pattern = valid_pattern + r' and cannot be specified for \\texttt\{(.*?)\}'
    if invalid_match := re_search(invalid_pattern, entry, group=2):
      invalid = set((invalid_match,))

  return valid, invalid

def _parse_failsafe(stype: str,
                    svalue: str
                    ) -> Union[str, int, float, dict, list]:
  """Parses a failsafe value."""

  # Remove leading and trailing whitespace
  atype = stype\
    .strip() \
    .replace('plist\\ ', '') \
    .split(',')[0] \
    .capitalize()

  if atype in ('Integer', 'Real'):
    atype = 'Number'
    # Handle typecasting for hexadecimal values
    if svalue[:2] == '0x':
      svalue = str(int(svalue, 16))
  elif atype in ('Data', 'Multidata'):
    atype = 'Data'
    if svalue[:2] == '0x':
      svalue = svalue[2:]
    # Handle filling size-specified data values
    if   size := re_search(', (\d+) bits?', stype, group=1):
      svalue = '0' * 1 * int(size)
    elif size := re_search(', (\d+) bytes?', stype, group=1):
      svalue = '0' * 2 * int(size)
  elif atype in ('Dictionary'):
    atype = 'Dict'

  if svalue.lower().split(maxsplit=1)[0] in ('empty',):
    svalue = ''

  # Ensure value is serializable as an annotated YAML type
  value = parse_yaml_types(atype, svalue, schema='annotated')

  return value

def _add_schema_entry(tree: list, cursor: dict, schema: dict) -> List[str]:
  """Parse and add failsafe values"""

  # Ensure that the parent entry is a valid target
  valid, invalid = _parse_exclusion_rules(cursor)
  pkey = tree[-1]
  if pkey not in valid and pkey in invalid: return

  # Parse failsafe value
  key = cursor['key']
  stype = cursor['type']
  svalue = cursor['value'] or 'Empty'
  entry = _parse_failsafe(stype, svalue)

  # Ensure that the parent entry exists in the schema
  parent_entry = nested_get(schema, tree, default={})
  if not parent_entry: nested_set(schema, tree, {})

  # Add entry to the schema
  if   isinstance(parent_entry, dict):
    entry_tree = (*tree, key)
    nested_set(schema, entry_tree, entry)
  elif isinstance(parent_entry, list):
    if not parent_entry: parent_entry = [{}]
    # Only one dict entry is allowed per array
    entry_tree = (*tree, 0, key)
    parent_entry[0][key] = entry
    nested_set(schema, tree, parent_entry)

  # Return the entry tree for validation
  return entry_tree

################################################################################
#                              Schema Parsing Methods                          #
################################################################################

def _is_boundary(lnorm: str, indent: str, entry: str) -> bool:
  """Checks if the line is a valid boundary command."""

  # Check if the line is a valid root command
  if lnorm[:1] == '\\' and indent <= 1:
    return True
  if not any(lnorm.startswith(c) for c in ('\item', '\end{enumerate}')):
    return False
  # Handle false positives for \item commands
  list_count = entry.count('\\begin{itemize}')
  is_building_list = list_count == 1 + entry.count('\\end{itemize}') > 0
  if lnorm != '\item' or is_building_list:
    return False

  return True

def _is_nested_boundary(cursor: dict, indent: str, entry: str) -> bool:
  """Checks if the line is a valid nested boundary command."""

  # Handle false positives for nested \item commands
  if cursor['type'] and indent >= 2:
    trailing_command = entry.rstrip().rsplit(maxsplit=1)[-1]
    is_nested_list = trailing_command in ('\\begin{itemize}', '\\tightlist')
    has_finished_list = trailing_command == '\\end{itemize}'
    # Is still a nested key entry
    if not is_nested_list and has_finished_list:
      return False

  return True

[docs]def parse_schema(file: Union[List[str], TextIOWrapper],
                 sample_plist: dict,
                 /,
                 raw_schema: Optional[dict] = None
                 ) -> dict:
  """Gets the Sample.plist schema from a Configuration.tex file.

  The Sample.plist schema is parsed from the Configuration.tex file, which is
  the source LaTeX file for the OpenCore documentation. These schema values
  match the declared failsafe values used as fallbacks in the OpenCore config
  file. Extracted schema values are then converted to a plist-compatible format
  and then coerced into native Python types.

  Args:
    file: The Configuration.tex file to parse.
    sample_plist: The sample plist to use for comparison.
    raw_schema: If specified, the raw schema will be stored in this dictionary.

  Returns:
    A dictionary representing failsafe values for the Sample.plist schema.
  """

  # Parse the configuration schema against the sample plist
  cursor = { 'tree': [], 'key': None, 'type': None, 'value': None, 'entry': '' }
  schema = {}
  for line in file:
    # Normalize line
    lnorm = str(lstrip := line.lstrip()).strip()
    # Skip comments
    if lnorm[:1] == '%': continue

    # Use root-level commands as boundaries for key entries
    indent = len(line) - len(lstrip)
    if (entry := cursor['entry'] or '') and _is_boundary(lnorm, indent, entry):
      # Add value to the schema if all required attributes are present
      if cursor['key'] and cursor['type']:
        # Normalize entry to fix formatting inconsistencies
        if not _is_nested_boundary(cursor, indent, entry):
          cursor['entry'] += f"\n{line}"
        cursor['entry'] = _normalize_lines(entry)
        # Add entry to the schema
        _parse_key_entry(cursor, schema, sample_plist, raw_schema=raw_schema)
      # Reset all attributes
      _reset_key_entry(cursor)
    # If set, continue storing the current LaTeX entry
    elif entry:
      cursor['entry'] += f"\n{line}"

    # Parse LaTeX commands
    if command := _extract_command(lnorm):
      # Parse section/subsection commands
      if command in ('\section', '\subsection', '\subsubsection'):
        name: str = re_search(f'\\{command}\{{(.*?)\}}', lnorm, group=1)
        # Is a section
        if command == '\section':
          entry = [name] if name in sample_plist.keys() else []
          cursor['tree'] = entry
        # Is a subsection
        else:
          key = re_search(r'([a-zA-Z0-9]+)\s?Properties', name, group=1)
          if command == '\subsection':
            entry = [key] if key else []
            cursor['tree'][1:] = entry
          elif command == '\subsubsection':
            entry = key.split(maxsplit=1)[:1] if key else []
            cursor['tree'][2:] = entry
      # Parse property fields for keys
      elif cursor['tree'] and command in ('\\texttt', '\\textbf'):
        if command == '\\texttt':
          # Is a key name
          if key := _extract_key(command, lnorm, sol=''):
            if not cursor['key']:
              cursor['key'] = key
              cursor['entry'] = line
        elif command == '\\textbf':
          # Is a key type
          if stype := _extract_value(command, lnorm, key='Type'):
            # Append additional attributes outside of the encapsulated value
            cursor['type'] = _parse_attributes(lnorm, stype)
          # Is a key value
          elif svalue := _extract_value(command, lnorm, key='Failsafe'):
            cursor['value'] = svalue

  return schema


__all__ = [
  # Functions (1)
  "parse_schema"
]