Marlin/buildroot/share/PlatformIO/scripts/schema.py

#!/usr/bin/env python3
"""
schema.py

Extract firmware configuration into structured JSON or YAML schema format.

Used by signature.py via common-dependencies.py to generate a schema file during the
PlatformIO build when CONFIG_EXPORT is defined in the configuration.

This script can also be run standalone from within the Marlin repo, and is a companion to
abm/js/schema.js in the MarlinFirmware/AutoBuildMarlin project, which has been extended to
evaluate conditions and can determine what options are actually enabled, not just which
options are uncommented. That will be migrated to this script for standalone migration.

Usage: schema.py [-h] [some|json|jsons|group|yml|yaml]

Process Marlin firmware configuration files (Configuration.h and Configuration_adv.h)
to produce structured output suitable for documentation, tooling, or automated processing.

Positional arguments:
  some      Generate both JSON and YAML output (schema.json and schema.yml)
  json      Generate JSON output (schema.json)
  jsons     Generate grouped JSON output with wildcard options (schema.json and schema_grouped.json)
  group     Generate grouped JSON output only (schema_grouped.json)
  yml       Generate YAML output (schema.yml)
  yaml      Same as 'yml'

Optional arguments:
  -h, --help  Show this help message and exit
"""

import re, json
from pathlib import Path

def extend_dict(d:dict, k:tuple):
    if len(k) >= 1 and k[0] not in d:
        d[k[0]] = {}
    if len(k) >= 2 and k[1] not in d[k[0]]:
        d[k[0]][k[1]] = {}
    if len(k) >= 3 and k[2] not in d[k[0]][k[1]]:
        d[k[0]][k[1]][k[2]] = {}

grouping_patterns = [
    re.compile(r'^([XYZIJKUVW]|[XYZ]2|Z[34]|E[0-7])$'),
    re.compile(r'^AXIS\d$'),
    re.compile(r'^(MIN|MAX)$'),
    re.compile(r'^[0-8]$'),
    re.compile(r'^HOTEND[0-7]$'),
    re.compile(r'^(HOTENDS|BED|PROBE|COOLER)$'),
    re.compile(r'^[XYZIJKUVW]M(IN|AX)$')
]
# If the indexed part of the option name matches a pattern
# then add it to the dictionary.
def find_grouping(gdict, filekey, sectkey, optkey, pindex):
    optparts = optkey.split('_')
    if 1 < len(optparts) > pindex:
        for patt in grouping_patterns:
            if patt.match(optparts[pindex]):
                subkey = optparts[pindex]
                modkey = '_'.join(optparts)
                optparts[pindex] = '*'
                wildkey = '_'.join(optparts)
                kkey = f'{filekey}|{sectkey}|{wildkey}'
                if kkey not in gdict: gdict[kkey] = []
                gdict[kkey].append((subkey, modkey))

# Build a list of potential groups. Only those with multiple items will be grouped.
def group_options(schema):
    for pindex in range(10, -1, -1):
        found_groups = {}
        for filekey, f in schema.items():
            for sectkey, s in f.items():
                for optkey in s:
                    find_grouping(found_groups, filekey, sectkey, optkey, pindex)

        fkeys = [ k for k in found_groups.keys() ]
        for kkey in fkeys:
            items = found_groups[kkey]
            if len(items) > 1:
                f, s, w = kkey.split('|')
                extend_dict(schema, (f, s, w))                      # Add wildcard group to schema
                for subkey, optkey in items:                        # Add all items to wildcard group
                    schema[f][s][w][subkey] = schema[f][s][optkey]  # Move non-wildcard item to wildcard group
                    del schema[f][s][optkey]
            del found_groups[kkey]

# Extract all board names from boards.h
def load_boards():
    bpath = Path("Marlin/src/core/boards.h")
    if bpath.is_file():
        with bpath.open(encoding='utf-8') as bfile:
            boards = []
            for line in bfile:
                if line.startswith("#define BOARD_"):
                    bname = line.split()[1]
                    if bname != "BOARD_UNKNOWN": boards.append(bname)
            return "['" + "','".join(boards) + "']"
    return ''

#
# Extract the specified configuration files in the form of a structured schema.
# Contains the full schema for the configuration files, not just the enabled options,
# Contains the current values of the options, not just data structure, so "schema" is a slight misnomer.
#
# The returned object is a nested dictionary with the following indexing:
#
#  - schema[filekey][section][define_name] = define_info
#
# Where the define_info contains the following keyed fields:
#    - section  = The @section the define is in
#    - name     = The name of the define
#    - enabled  = True if the define is enabled (not commented out)
#    - line     = The line number of the define
#    - sid      = A serial ID for the define
#    - value    = The value of the define, if it has one
#    - type     = The type of the define, if it has one
#    - requires = The conditions that must be met for the define to be enabled
#    - comment  = The comment for the define, if it has one
#    - units    = The units for the define, if it has one
#    - options  = The options for the define, if it has any
#
def extract_files(filekey):
    # Load board names from boards.h
    boards = load_boards()

    # Parsing states
    class Parse:
        NORMAL          = 0 # No condition yet
        BLOCK_COMMENT   = 1 # Looking for the end of the block comment
        EOL_COMMENT     = 2 # EOL comment started, maybe add the next comment?
        SLASH_COMMENT   = 3 # Block-like comment, starting with aligned //
        GET_SENSORS     = 4 # Gathering temperature sensor options
        ERROR           = 9 # Syntax error

    # A JSON object to store the data
    sch_out = { key:{} for key in filekey.values() }
    # Regex for #define NAME [VALUE] [COMMENT] with sanitized line
    defgrep = re.compile(r'^(//)?\s*(#define)\s+([A-Za-z0-9_]+)\s*(.*?)\s*(//.+)?$')
    # Pattern to match a float value
    flt = r'[-+]?\s*(\d+\.|\d*\.\d+)([eE][-+]?\d+)?[fF]?'
    # Start with unknown state
    state = Parse.NORMAL
    # Serial ID
    sid = 0
    # Loop through files and parse them line by line
    for fn, fk in filekey.items():
        with Path("Marlin", fn).open(encoding='utf-8') as fileobj:
            section = 'none'        # Current Settings section
            line_number = 0         # Counter for the line number of the file
            conditions = []         # Create a condition stack for the current file
            comment_buff = []       # A temporary buffer for comments
            prev_comment = ''       # Copy before reset for an EOL comment
            options_json = ''       # A buffer for the most recent options JSON found
            eol_options = False     # The options came from end of line, so only apply once
            join_line = False       # A flag that the line should be joined with the previous one
            line = ''               # A line buffer to handle \ continuation
            last_added_ref = {}     # Reference to the last added item
            # Loop through the lines in the file
            for the_line in fileobj.readlines():
                line_number += 1

                # Clean the line for easier parsing
                the_line = the_line.strip()

                if join_line:   # A previous line is being made longer
                    line += (' ' if line else '') + the_line
                else:           # Otherwise, start the line anew
                    line, line_start = the_line, line_number

                # If the resulting line ends with a \, don't process now.
                # Strip the end off. The next line will be joined with it.
                join_line = line.endswith("\\")
                if join_line:
                    line = line[:-1].strip()
                    continue
                else:
                    line_end = line_number

                defmatch = defgrep.match(line)

                # Special handling for EOL comments after a #define.
                # At this point the #define is already digested and inserted,
                # so we have to extend it
                if state == Parse.EOL_COMMENT:
                    # If the line is not a comment, we're done with the EOL comment
                    if not defmatch and the_line.startswith('//'):
                        comment_buff.append(the_line[2:].strip())
                    else:
                        state = Parse.NORMAL
                        cline = ' '.join(comment_buff)
                        comment_buff = []
                        if cline != '':
                            # A (block or slash) comment was already added
                            cfield = 'notes' if 'comment' in last_added_ref else 'comment'
                            last_added_ref[cfield] = cline

                #
                # Add the given comment line to the comment buffer, unless:
                # - The line starts with ':' and JSON values to assign to 'opt'.
                # - The line starts with '@section' so a new section needs to be returned.
                # - The line starts with '======' so just skip it.
                #
                def use_comment(c, opt, sec, bufref):
                    '''
                    c       - The comment line to parse
                    opt     - Options JSON string to return (if not updated)
                    sec     - Section to return (if not updated)
                    bufref  - The comment buffer to add to
                    '''
                    sc = c.strip()                      # Strip for special patterns
                    if sc.startswith(':'):              # If the comment starts with : then it has magic JSON
                        d = sc[1:].strip()              # Strip the leading : and spaces
                        # Look for a JSON container
                        cbr = sc.rindex('}') if d.startswith('{') else sc.rindex(']') if d.startswith('[') else 0
                        if cbr:
                            opt, cmt = sc[1:cbr+1].strip(), sc[cbr+1:].strip()
                            if cmt != '': bufref.append(cmt)
                        else:
                            opt = sc[1:].strip()        # Some literal value not in a JSON container?
                    else:
                        m = re.match(r'@section\s*(.+)', sc) # Start a new section?
                        if m:
                            sec = m[1]
                        elif not sc.startswith('========'):
                            bufref.append(c)            # Anything else is part of the comment
                    return opt, sec

                # For slash comments, capture consecutive slash comments.
                # The comment will be applied to the next #define.
                if state == Parse.SLASH_COMMENT:
                    if not defmatch and the_line.startswith('//'):
                        options_json, section = use_comment(the_line[2:].strip(), options_json, section, comment_buff)
                        continue
                    else:
                        state = Parse.NORMAL

                # In a block comment, capture lines up to the end of the comment.
                # Assume nothing follows the comment closure.
                if state in (Parse.BLOCK_COMMENT, Parse.GET_SENSORS):
                    endpos = line.find('*/')
                    if endpos < 0:
                        cline = line
                    else:
                        cline, line = line[:endpos].strip(), line[endpos+2:].strip()

                        # Temperature sensors are done
                        if state == Parse.GET_SENSORS:
                            options_json = f'[ {options_json[:-2]} ]'
                        state = Parse.NORMAL

                    # Strip the leading '* ' from block comments
                    cline = re.sub(r'^\* ?', '', cline)

                    # Collect temperature sensors
                    if state == Parse.GET_SENSORS:
                        sens = re.match(r'^\s*(-?\d+)\s*:\s*(.+)$', cline)
                        if sens:
                            s2 = sens[2].replace("'", "''")
                            options_json += f"{sens[1]}:'{sens[1]} - {s2}', "

                    elif state == Parse.BLOCK_COMMENT:

                        # Look for temperature sensors
                        if re.match(r'temperature sensors.*:', cline, re.IGNORECASE):
                            state, cline = Parse.GET_SENSORS, "Temperature Sensors"

                        options_json, section = use_comment(cline, options_json, section, comment_buff)

                # For the normal state we're looking for any non-blank line
                elif state == Parse.NORMAL:
                    # Skip a commented define when evaluating comment opening
                    st = 2 if re.match(r'^//\s*#define', line) else 0
                    cpos1 = line.find('/*')     # Start a block comment on the line?
                    cpos2 = line.find('//', st) # Start an end of line comment on the line?

                    # Only the first comment starter gets evaluated
                    cpos = -1
                    if cpos1 != -1 and (cpos1 < cpos2 or cpos2 == -1):
                        cpos = cpos1
                        comment_buff = []
                        state = Parse.BLOCK_COMMENT
                        eol_options = False
                    elif cpos2 != -1 and (cpos2 < cpos1 or cpos1 == -1):
                        cpos = cpos2

                        # Comment after a define may be continued on the following lines
                        if defmatch is not None and cpos > 10:
                            state = Parse.EOL_COMMENT
                            prev_comment = '\n'.join(comment_buff)
                            comment_buff = []
                        else:
                            state = Parse.SLASH_COMMENT

                    # Process the start of a new comment
                    if cpos != -1:
                        comment_buff = []
                        cline, line = line[cpos+2:].strip(), line[:cpos].strip()

                        if state == Parse.BLOCK_COMMENT:
                            # Strip leading '*' from block comments
                            cline = re.sub(r'^\* ?', '', cline)
                        else:
                            # Expire end-of-line options after first use
                            if cline.startswith(':'): eol_options = True

                        # Buffer a non-empty comment start
                        if cline != '':
                            options_json, section = use_comment(cline, options_json, section, comment_buff)

                    # If the line has nothing before the comment, go to the next line
                    if line == '':
                        options_json = ''
                        continue

                    # Parenthesize the given expression if needed
                    def atomize(s):
                        if s == '' \
                        or re.match(r'^[A-Za-z0-9_]*(\([^)]+\))?$', s) \
                        or re.match(r'^[A-Za-z0-9_]+ == \d+?$', s):
                            return s
                        return f'({s})'

                    #
                    # The conditions stack is an array containing condition-arrays.
                    # Each condition-array lists the conditions for the current block.
                    # IF/N/DEF adds a new condition-array to the stack.
                    # ELSE/ELIF/ENDIF pop the condition-array.
                    # ELSE/ELIF negate the last item in the popped condition-array.
                    # ELIF adds a new condition to the end of the array.
                    # ELSE/ELIF re-push the condition-array.
                    #
                    cparts = line.split()
                    iselif, iselse = cparts[0] == '#elif', cparts[0] == '#else'
                    if iselif or iselse or cparts[0] == '#endif':
                        if len(conditions) == 0:
                            raise Exception(f'no #if block at line {line_number}')

                        # Pop the last condition-array from the stack
                        prev = conditions.pop()

                        if iselif or iselse:
                            prev[-1] = '!' + prev[-1] # Invert the last condition
                            if iselif: prev.append(atomize(line[5:].strip()))
                            conditions.append(prev)

                    elif cparts[0] == '#if':
                        conditions.append([ atomize(line[3:].strip()) ])
                    elif cparts[0] == '#ifdef':
                        conditions.append([ f'defined({line[6:].strip()})' ])
                    elif cparts[0] == '#ifndef':
                        conditions.append([ f'!defined({line[7:].strip()})' ])

                    # Handle a complete #define line
                    elif defmatch is not None:

                        # Get the match groups into vars
                        enabled, define_name, val = defmatch[1] is None, defmatch[3], defmatch[4]

                        # Increment the serial ID
                        sid += 1

                        # Create a new dictionary for the current #define
                        define_info = {
                            'section': section,
                            'name': define_name,
                            'enabled': enabled,
                            'line': line_start,
                            'sid': sid
                        }

                        # Type is based on the value
                        value_type = \
                             'switch'  if val == '' \
                        else 'int'     if re.match(r'^[-+]?\s*\d+$', val) \
                        else 'ints'    if re.match(r'^([-+]?\s*\d+)(\s*,\s*[-+]?\s*\d+)+$', val) \
                        else 'floats'  if re.match(rf'({flt}(\s*,\s*{flt})+)', val) \
                        else 'float'   if re.match(f'^({flt})$', val) \
                        else 'string'  if val[0] == '"' \
                        else 'char'    if val[0] == "'" \
                        else 'bool'    if val in ('true', 'false') \
                        else 'state'   if val in ('HIGH', 'LOW') \
                        else 'enum'    if re.match(r'^[A-Za-z0-9_]{3,}$', val) \
                        else 'int[]'   if re.match(r'^{\s*[-+]?\s*\d+(\s*,\s*[-+]?\s*\d+)*\s*}$', val) \
                        else 'float[]' if re.match(r'^{{\s*{flt}(\s*,\s*{flt})*\s*}}$', val) \
                        else 'array'   if val[0] == '{' \
                        else ''

                        val = (val == 'true')           if value_type == 'bool' \
                        else int(val)                   if value_type == 'int' \
                        else val.replace('f','')        if value_type == 'floats' \
                        else float(val.replace('f','')) if value_type == 'float' \
                        else val

                        if val != '': define_info['value'] = val
                        if value_type != '': define_info['type'] = value_type

                        # Join up accumulated conditions with &&
                        if conditions: define_info['requires'] = '(' + ') && ('.join(sum(conditions, [])) + ')'

                        # If the comment_buff is not empty, add the comment to the info
                        if comment_buff:
                            full_comment = '\n'.join(comment_buff).strip()

                            # An EOL comment will be added later
                            # The handling could go here instead of above
                            if state == Parse.EOL_COMMENT:
                                define_info['comment'] = ''
                            else:
                                define_info['comment'] = full_comment
                                comment_buff = []

                            # If the comment specifies units, add that to the info
                            units = re.match(r'^\(([^)]+)\)', full_comment)
                            if units:
                                units = units[1]
                                if units in ('s', 'sec'): units = 'seconds'
                                define_info['units'] = units

                        if 'comment' not in define_info or define_info['comment'] == '':
                            if prev_comment:
                                define_info['comment'] = prev_comment
                                prev_comment = ''

                        if 'comment' in define_info and define_info['comment'] == '':
                            del define_info['comment']

                        # Set the options for the current #define
                        if define_name == "MOTHERBOARD" and boards != '':
                            define_info['options'] = boards
                        elif options_json != '':
                            define_info['options'] = options_json
                            if eol_options: options_json = ''

                        # Create section dict if it doesn't exist yet
                        if section not in sch_out[fk]: sch_out[fk][section] = {}

                        # If define has already been seen...
                        if define_name in sch_out[fk][section]:
                            info = sch_out[fk][section][define_name]
                            if isinstance(info, dict): info = [ info ]  # Convert a single dict into a list
                            info.append(define_info)                    # Add to the list
                        else:
                            # Add the define dict with name as key
                            sch_out[fk][section][define_name] = define_info

                        if state == Parse.EOL_COMMENT:
                            last_added_ref = define_info

    return sch_out

#
# Extract the current configuration files in the form of a structured schema.
#
def extract():
    # List of files to process, with shorthand
    return extract_files({ 'Configuration.h':'basic', 'Configuration_adv.h':'advanced' })

def dump_json(schema:dict, jpath:Path):
    with jpath.open('w', encoding='utf-8') as jfile:
        json.dump(schema, jfile, ensure_ascii=False, indent=2)

def dump_yaml(schema:dict, ypath:Path):
    import yaml

    # Custom representer for all multi-line strings
    def str_literal_representer(dumper, data):
        if '\n' in data:  # Check for multi-line strings
            # Add a newline to trigger '|+'
            if not data.endswith('\n'): data += '\n'
            return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|')
        return dumper.represent_scalar('tag:yaml.org,2002:str', data)

    yaml.add_representer(str, str_literal_representer)

    with ypath.open('w', encoding='utf-8') as yfile:
        yaml.dump(schema, yfile, default_flow_style=False, width=120, indent=2)

def main():
    try:
        schema = extract()
    except Exception as exc:
        print("Error: " + str(exc))
        schema = None

    if schema:

        # Get the command line arguments after the script name
        import sys
        args = sys.argv[1:]
        if len(args) == 0: args = ['some']

        # Does the given array intersect at all with args?
        def inargs(c): return len(set(args) & set(c)) > 0

        # Help / Unknown option
        unk = not inargs(['some','json','jsons','group','yml','yaml', '-h', '--help'])
        if (unk): print(f"Unknown option: '{args[0]}'")
        if inargs(['-h', '--help']) or unk:
            print("Usage: schema.py [-h] [some|json|jsons|group|yml|yaml]")
            print("       some  = json + yml")
            print("       jsons = json + group")
            return

        # JSON schema
        if inargs(['some', 'json', 'jsons']):
            print("Generating JSON ...")
            dump_json(schema, Path('schema.json'))

        # JSON schema (wildcard names)
        if inargs(['group', 'jsons']):
            group_options(schema)
            dump_json(schema, Path('schema_grouped.json'))

        # YAML
        if inargs(['some', 'yml', 'yaml']):
            try:
                import yaml
            except ImportError:
                print("Installing YAML module ...")
                import subprocess
                try:
                    subprocess.run(['python3', '-m', 'pip', 'install', 'pyyaml'])
                    import yaml
                except:
                    print("Failed to install YAML module")
                    return

            print("Generating YML ...")
            dump_yaml(schema, Path('schema.yml'))

if __name__ == '__main__':
    main()