# glsl_analyzer/spec/gen_spec.py

#!/usr/bin/env python3

import sys
import os
from glob import iglob
import json
from bs4 import BeautifulSoup
import tokenize
import re

# Module-level accumulators. The process_* functions below append records to
# these lists; the script sorts them by name and dumps them to the output
# JSON at the end of the file.
# {'name', 'kind'} records from the GLSL spec keyword lists.
keywords = []
# {'name', 'description'} records from the GLSL spec basic-types tables.
types = []
# {'name', 'precedence', 'left_to_right', 'kind'} records from the operator table.
operators = []
# Built-in variable records (docs.gl pages, extension specs, manual entries).
variables = []
# Function prototype records (docs.gl pages and extension specs).
functions = []

def process_glsl_html_spec(path):
    """Extract keywords, basic types and operators from the GLSL spec HTML.

    Records are appended to the module-level ``keywords``, ``types`` and
    ``operators`` lists; nothing is returned.

    Args:
        path: Path to the HTML rendering of the GLSL specification.

    Raises:
        AssertionError: If the operator table does not have the expected
            four column headers.
    """
    # Decode as UTF-8 instead of the locale default so the result does not
    # depend on the machine running the script (assumes the spec is UTF-8).
    with open(path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # The keyword lists are consecutive <dl> elements. By position, the
    # second is tagged 'vulkan', the third 'reserved', and any other 'glsl'.
    keyword_kinds = {1: 'vulkan', 2: 'reserved'}
    keywords_section = soup.find(id='keywords').parent
    for index, dl in enumerate(keywords_section.find_all('dl')):
        kind = keyword_kinds.get(index, 'glsl')
        for strong in dl.find_all('strong'):
            keywords.append({
                'name': strong.getText(),
                'kind': kind,
            })

    basic_types_section = soup.find(id='basic-types').parent
    for table in basic_types_section.find_all('table'):
        headers = table.find_all('th')
        # Only tables headed "Type | Meaning" describe types. Tables with
        # fewer than two header cells cannot match and are skipped as well.
        if len(headers) < 2: continue
        if headers[0].getText() != 'Type': continue
        if headers[1].getText() != 'Meaning': continue

        for row in table.find('tbody').find_all('tr'):
            name_cell, meaning_cell = row.find_all('td')
            # Collapse all runs of whitespace in the description.
            meaning = ' '.join(meaning_cell.getText().split())

            # One cell may list several type names, one per line.
            for name in name_cell.getText().splitlines():
                types.append({
                    'name': name,
                    'description': [meaning],
                })

    operators_section = soup.find(id='operators').parent
    operator_table = operators_section.find('table')
    headers = operator_table.find_all('th')
    assert headers[0].getText() == 'Precedence'
    assert headers[1].getText() == 'Operator Class'
    assert headers[2].getText() == 'Operators'
    assert headers[3].getText() == 'Associativity'

    # Grouping, indexing, member access and sequencing are not emitted.
    ignored_operators = ['(', ')', '[', ']', '.', ',']
    for row in operator_table.find('tbody').find_all('tr'):
        precedence, operator_class, operator_words, associativity = row.find_all('td')

        # The precedence cell starts with the numeric level.
        precedence_number = int(precedence.getText().split()[0])
        left_to_right = associativity.getText() == 'Left to Right'

        operator_class = operator_class.getText()
        kind = 'infix'
        if 'prefix' in operator_class: kind = 'prefix'
        if 'post fix' in operator_class: kind = 'postfix'

        for word in operator_words.getText().split():
            if word in ignored_operators: continue
            operators.append({
                'name': word,
                'precedence': precedence_number,
                'left_to_right': left_to_right,
                'kind': kind,
            })


def process_docs_gl_file(path):
    """Extract variable or function records from one docs.gl reference page.

    Pages whose file name starts with ``gl_`` are treated as built-in
    variable pages and append to the module-level ``variables`` list. All
    other pages append parsed prototypes to ``functions``. A page without an
    element with id ``description`` is skipped.

    Args:
        path: Path to a docs.gl ``.xhtml`` page.
    """
    is_variable = os.path.basename(path).startswith('gl_')

    # Decode as UTF-8 instead of the locale default: the math handling in
    # this file expects non-ASCII characters such as 'δ' to survive intact.
    # The file is closed as soon as it has been read.
    with open(path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    desc_node = soup.find(id='description')
    if desc_node is None: return
    description = [paragraph_to_markdown(p) for p in desc_node.find_all('p')]

    versions_table = soup.find(id='versions')
    # The last header row names the versions; "4.50" becomes 450. The first
    # column is skipped because it holds the row label, not a version.
    versions_header = versions_table.find('thead').find_all('tr')[-1]
    versions = [int(v.getText().replace('.', '')) for v in versions_header.find_all('th')[1:]]

    # One list of supported versions per table row; a '-' cell means the
    # version is not supported.
    version_support = [
        [v for v, data in zip(versions, row.find_all('td')[1:]) if data.getText().strip() != '-']
        for row in versions_table.find('tbody').find_all('tr')
    ]

    if is_variable:
        for node in soup.find_all(attrs={'class':'fieldsynopsis'}):
            variable = parse_variable(node.getText())
            variable['description'] = description
            variable['versions'] = version_support[0]
            variables.append(variable)
        # gl_Position is always added by hand.
        # NOTE(review): presumably its page has no parsable fieldsynopsis
        # entry; confirm against the docs.gl sources.
        if 'gl_Position' in path:
            variable = { 'name': 'gl_Position', 'type': 'vec4' }
            variable['description'] = description
            variable['versions'] = version_support[0]
            variables.append(variable)
    else:
        for i, node in enumerate(soup.find_all(attrs={'class':'funcprototype-table'})):
            prototype = parse_prototype(node.getText())
            prototype['description'] = description
            # Pair the i-th prototype with the i-th support row, reusing the
            # last row when there are more prototypes than rows.
            prototype['versions'] = version_support[min(i, len(version_support) - 1)]
            functions.append(prototype)

def paragraph_to_markdown(paragraph):
    """Convert a parsed ``<p>`` element into a single line of markdown.

    A paragraph whose first math element contains an ``mtable`` is rendered
    as a fenced code block instead. Otherwise ``<em>``, ``<code>`` and
    ``<math>`` children are replaced in place by their markdown form and the
    paragraph text is returned with whitespace collapsed.

    Note: the paragraph node is modified in place.
    """
    math = paragraph.math
    if math is not None and math.mtable is not None:
        table_text = expand_math(math.mtable).replace('δ  ', 'δ')
        return '```\n' + table_text + '\n```\n'

    # Emphasis first, then inline code: both simply wrap the tag's text.
    for tag_name, marker in (('em', '_'), ('code', '`')):
        for tag in paragraph.find_all(tag_name):
            tag.replace_with(marker + tag.getText() + marker)

    # Inline math is flattened to text and shown as code.
    for tag in paragraph.find_all('math'):
        tag.replace_with('`' + escape_math(tag) + '`')

    words = paragraph.getText().split()
    return ' '.join(words)

def math_children(node):
    """Return the direct children of ``node`` whose ``name`` is not None.

    This keeps the element children and drops the nameless ones (text nodes).
    """
    return [child for child in node.children if child.name is not None]

def escape_math(node):
    """Render an inline math node as text suitable for a markdown code span."""
    expanded = expand_math(node)
    # NOTE(review): str.split with an argument splits on that exact
    # three-character substring (space, tab, CR), not on any whitespace
    # character, so this split/join is nearly a no-op. Kept as is to
    # preserve output; confirm whether a whitespace collapse was intended.
    segments = expanded.split(" \t\r")
    joined = ' '.join(segments)
    return joined.replace('δ  ', 'δ')

def expand_math(node):
    """Render a MathML node and its subtree as plain text.

    Args:
        node: A parsed node exposing ``name``, ``children``, ``getText()``
            and, for ``mfenced``, attribute access via ``get``.

    Returns:
        The textual rendering. Fractions use ``/``, superscripts ``**``,
        subscripts ``_`` and tables one line per row.

    Raises:
        ValueError: If the node's tag is not one of the handled MathML tags.
    """
    name = node.name

    # Text nodes and token elements contribute their own text.
    if name is None or name in ('mi', 'mn', 'mo'):
        text = node.getText().strip()
        # A lone U+2061 (FUNCTION APPLICATION) renders as nothing.
        return '' if text == '\u2061' else text

    if name in ('mrow', 'math'):
        return ' '.join(expand_math(c) for c in node.children)

    if name == 'mfrac':
        parts = math_children(node)
        return f'{expand_math(parts[0])} / {expand_math(parts[1])}'

    if name == 'mfenced':
        # MathML defaults the fence characters to parentheses when the
        # attributes are omitted.
        opening = node.get('open', '(')
        closing = node.get('close', ')')
        inner = ' '.join(expand_math(c) for c in node.children).strip()
        return opening + inner + closing

    if name == 'msup':
        parts = math_children(node)
        return expand_math(parts[0]) + '**' + expand_math(parts[1])

    if name == 'msub':
        parts = math_children(node)
        return expand_math(parts[0]) + '_' + expand_math(parts[1])

    if name == 'msubsup':
        parts = math_children(node)
        return (expand_math(parts[0])
            + '_' + expand_math(parts[1])
            + '^' + expand_math(parts[2]))

    if name == 'mtable':
        lines = []
        for row in node.find_all('mtr'):
            # Every cell is followed by two spaces, every row by a newline.
            cells = [
                ' '.join(expand_math(c) for c in col.children) + '  '
                for col in row.find_all('mtd')
            ]
            lines.append(''.join(cells) + '\n')
        return ''.join(lines)

    if name == 'msqrt':
        inner = ' '.join(expand_math(c) for c in math_children(node))
        return f'sqrt({inner})'

    raise ValueError(f'unknown math node {node.name}: {node}')

# Words accepted as declaration/parameter modifiers by parse_variable and
# parse_prototype.
modifier_keywords = ['out', 'in', 'inout', 'const', 'highp', 'lowp', 'perprimitiveEXT']
# Regex alternation over the modifier keywords, wrapped in one group.
modifier_regex = '(' + '|'.join(modifier_keywords) + ')'

# Matches text that looks like a GLSL type: one of the listed scalar names,
# or any word containing 'vec', 'mat' or a 'gen...Type' placeholder.
type_regex = r'(void|int|float|double|bool|\w*(vec|mat|gen\w*Type)\w*)'

def parse_variable(text):
    """Parse a variable declaration such as ``in vec4 gl_Foo[4] = x;``.

    Returns a dict with ``type`` and ``name`` keys, plus ``modifiers``
    (space-separated) when any modifier keyword precedes the type and
    ``default_value`` when an initializer follows ``=``. An array suffix
    after the name is appended to ``type``.

    Raises:
        AssertionError: If the declaration does not end at a ``;``.
        IndexError: If the token stream ends before the declaration does.
    """
    tokens = tokenize(text)
    variable = {}

    # Leading modifier keywords are collected into one space-separated string.
    pos = 0
    modifiers = []
    while tokens[pos] in modifier_keywords:
        modifiers.append(tokens[pos])
        pos += 1
    if modifiers:
        variable['modifiers'] = ' '.join(modifiers)

    variable['type'], variable['name'] = tokens[pos], tokens[pos + 1]
    pos += 2

    # An array suffix after the name belongs to the type.
    if tokens[pos] == '[':
        closing = pos
        while tokens[closing] != ']':
            closing += 1
        variable['type'] += ''.join(tokens[pos:closing + 1])
        pos = closing + 1

    # Everything between '=' and ';' is the default value.
    if tokens[pos] == '=':
        value_start = pos + 1
        pos = value_start
        while tokens[pos] != ';':
            pos += 1
        variable['default_value'] = ' '.join(tokens[value_start:pos])

    assert tokens[pos] == ';'

    return variable

def parse_prototype(text):
    """Parse a function prototype such as ``vec4 foo(in vec3 a, [int b]);``.

    Returns a dict with ``return_type``, ``name`` and ``parameters``. Each
    parameter is a dict with ``type`` and, when present, ``optional``,
    ``modifiers`` and ``name``. A parameter wrapped in square brackets is
    marked optional, and an array suffix is appended to the parameter type.

    Raises:
        AssertionError: If the prototype does not end in ``)`` or ``);``.
    """
    tokens = tokenize(text)
    count = len(tokens)

    # The two tokens directly before '(' are the return type and the name.
    # Anything earlier is dropped as the window slides forward.
    return_type, name = tokens[0], tokens[1]
    pos = 2
    while pos < count and tokens[pos] != '(':
        return_type, name = name, tokens[pos]
        pos += 1
    pos += 1  # step past '('

    parameters = []
    # Stop two tokens before the end, which leaves the closing tokens alone.
    while pos < count - 2:
        parameter = {}

        optional = tokens[pos] == '['
        if optional:
            pos += 1
            parameter['optional'] = True

        modifiers = []
        while tokens[pos] in modifier_keywords:
            modifiers.append(tokens[pos])
            pos += 1
        if modifiers:
            parameter['modifiers'] = ' '.join(modifiers)

        param_type = tokens[pos]
        # A 'void' in type position ends parameter parsing.
        if param_type == 'void':
            break
        parameter['type'] = param_type
        pos += 1

        # The name is optional; a symbol here means the parameter is unnamed.
        candidate = tokens[pos]
        if candidate[0].isalnum():
            parameter['name'] = candidate
            pos += 1

        if tokens[pos] == '[':
            array_start = pos
            while pos < count and tokens[pos] != ']':
                pos += 1
            pos += 1
            parameter['type'] += ''.join(tokens[array_start:pos])

        parameters.append(parameter)

        if optional:
            assert tokens[pos] == ']'
            pos += 1
        if tokens[pos] == ',':
            pos += 1

    assert tokens[-1] == ')' or (tokens[-2] == ')' and tokens[-1] == ';')

    return {
        'return_type': return_type,
        'name': name,
        'parameters': parameters,
    }


def tokenize(text):
    """Split ``text`` into words and single-character symbols.

    A word starts with an alphanumeric character and continues through
    alphanumerics and underscores. Whitespace is dropped, and every other
    character becomes its own token.
    """
    tokens = []
    pos, end = 0, len(text)
    while pos < end:
        ch = text[pos]
        if ch.isspace():
            pos += 1
        elif ch.isalnum():
            stop = pos + 1
            while stop < end and (text[stop].isalnum() or text[stop] == '_'):
                stop += 1
            tokens.append(text[pos:stop])
            pos = stop
        else:
            tokens.append(ch)
            pos += 1
    return tokens

def group_by_indent(text: str, tab_indent=None):
    """Group the lines of ``text`` into nested lists by indentation.

    Lines at the base indentation are strings in the returned list. A run of
    more deeply indented lines becomes a nested list placed after the line
    that precedes it, recursively. Leading whitespace is removed from every
    line, and blank lines are kept as ``''`` in the current group.

    Args:
        text: The text to group.
        tab_indent: Number of columns a tab counts for. When falsy, the
            currently detected indentation step is used.

    Returns:
        A list whose items are strings or nested lists of the same shape.
    """
    stack = [[]]
    indents = [0]
    indentation = 4
    for line in text.splitlines():
        # Track columns and characters separately: a tab is one character
        # but several columns, so slicing the line by the column count would
        # cut into its content.
        width = 0
        consumed = 0
        for ch in line:
            if ch == ' ':
                width += 1
            elif ch == '\t':
                width += tab_indent or indentation
            else:
                break
            consumed += 1

        line = line[consumed:]

        # An indent of 2 modulo 4 switches to a two-column step for the rest
        # of the text.
        if width % 4 == 2:
            indentation = 2

        # Round to nearest multiple of indentation to account for minor errors
        width = int(round(width / indentation) * indentation)

        if not line:
            stack[-1].append('')
            continue

        # Close every group that is indented deeper than this line.
        while width < indents[-1]:
            finished = stack.pop()
            stack[-1].append(finished)
            indents.pop()

        if width == indents[-1]:
            stack[-1].append(line)
        else:
            # Deeper than the current group: open a new nested group.
            stack.append([line])
            indents.append(width)

    # Fold any groups still open into their parents.
    while len(stack) > 1:
        finished = stack.pop()
        stack[-1].append(finished)

    return stack[0]

def find_matching_groups(groups, pattern, flags=0):
    """Yield slices of ``groups`` whose heading lines match ``pattern``.

    ``groups`` is a nested list of strings and lists. Consecutive strings
    are accumulated (newline-joined) and searched for ``pattern``. When they
    match, the slice from the first of those strings through the next nested
    list is yielded, or through the end of ``groups`` if no list follows.
    Nested lists that do not follow a match are searched recursively.
    """
    run_start = None        # where the slice to yield begins, once matched
    after_last_list = 0     # index just past the most recent nested list
    pending = ''            # strings seen since the most recent nested list

    for index, item in enumerate(groups):
        if not isinstance(item, str):
            pending = ''
            after_last_list = index + 1

            if run_start is None:
                # Nothing matched before this list: look inside it instead.
                yield from find_matching_groups(item, pattern, flags)
            else:
                yield groups[run_start:index + 1]
                run_start = None
            continue

        pending += '\n' + item
        if re.search(pattern, pending, flags) is not None:
            run_start = after_last_list

    if run_start is not None:
        yield groups[run_start:]

def flatten(xs):
    """Yield the non-list items of ``xs`` depth-first, descending into lists."""
    for item in xs:
        if not isinstance(item, list):
            yield item
        else:
            yield from flatten(item)

def find_prototypes(text):
    """Yield function prototypes found in a chunk of extension-spec text.

    ASCII tables with a "syntax"/"function" column and a "description"
    column are scanned first, so prototypes found there carry the table's
    description. The whole text is then scanned for prototypes without a
    description. Only the first prototype seen for each function name is
    considered, and only prototypes whose return type looks like a GLSL type
    are yielded. Other candidates are reported on stdout unless their "return
    type" is a known English word.

    Args:
        text: The text to scan.

    Yields:
        Prototype dicts as produced by ``parse_prototype``, with an added
        ``description`` list when the prototype came from a table.
    """
    syntaxes = []
    descriptions = []

    for match in re.finditer(r'(\s*(\+(-*\+)+|\|([^|]*\|)+))+', text, re.MULTILINE):
        table_text = match.group().strip()

        header = True
        # True once this table has opened an entry for its data rows.
        entry_open = False
        for i, line in enumerate(table_text.splitlines()):
            line = line.strip()
            # The table regex admits whitespace, including blank lines,
            # between rows; such a line has nothing to parse.
            if not line: continue

            if line[0] == '+':
                # Every border after the top one starts a new entry.
                if i != 0:
                    syntaxes.append('')
                    descriptions.append('')
                    entry_open = True
                continue

            pieces = line.split('|')
            if len(pieces) != 4: break

            if header:
                if re.match(r'\s*(syntax|function)\s*', pieces[1], re.IGNORECASE) is None: break
                if re.match(r'\s*description\s*', pieces[2], re.IGNORECASE) is None: break
                header = False
                continue

            # A data row with no border since the header still needs an
            # entry of its own. Without one it would extend the previous
            # table's entry, or fail if there is none.
            if not entry_open:
                syntaxes.append('')
                descriptions.append('')
                entry_open = True

            syntaxes[-1] += pieces[1]
            descriptions[-1] += pieces[2] + '\n'

    syntaxes = [' '.join(syntax.strip().split()) for syntax in syntaxes]
    descriptions = [description.strip().split('\n\n') for description in descriptions]

    # Finally scan the full text for prototypes that have no table entry.
    syntaxes.append(text)
    descriptions.append(None)

    found = set()
    known_invalid_pattern = r'(functions?|of|enable|the|and|to|call|with|if|as|in)'

    for syntax, description in zip(syntaxes, descriptions):
        for match in re.finditer(r'((\w+\s+)+\w+\([^()]*\))', syntax, re.MULTILINE):
            match_text = match.group(1)

            if '|' in match_text: continue

            proto = parse_prototype(match_text)

            if proto['name'] in found: continue
            found.add(proto['name'])

            if description is not None:
                proto['description'] = description

            return_type = proto['return_type']
            if re.search(type_regex, return_type) is not None:
                yield proto
            elif re.match(known_invalid_pattern, return_type, re.IGNORECASE) is None:
                print('unknown return type:', match_text)

def process_extension_file(path):
    """Extract built-in variables and functions from one extension spec.

    The text is grouped by indentation. Variable declarations are taken from
    sections mentioning chapter or section 7, and function prototypes from
    sections mentioning chapter or section 8. Descriptions are attached from
    paragraphs that mention the variable or function by name. Results are
    appended to the module-level ``variables`` and ``functions`` lists.

    Files listed in ``implicit_extensions`` are skipped. For any other file
    that declares no ``#extension`` name, the script exits with an error.

    Args:
        path: Path to the extension specification text file.
    """
    filename = os.path.basename(path)
    with open(path, 'r') as f:
        text = f.read()

    # NOTE(review): ``filename`` keeps its extension (compare the entries of
    # ``implicit_extensions`` below), so this extension-less literal looks
    # like it can never match. Left unchanged to keep output stable; confirm.
    tab_indent = 4
    if filename == 'GL_HUAWEI_cluster_culling_shader': tab_indent = 8
    groups = group_by_indent(text, tab_indent)

    extension_names = [
        match.group(1)
        for match in re.finditer(r'\s+#extension (\w+)\s*:\s*<\w+>', text)
    ]

    implicit_extensions = [
        'GL_KHR_vulkan_glsl.txt',
        'GL_EXT_vulkan_glsl_relaxed.txt',
        'GLSL_EXT_shader_subgroup_extended_types.txt',
    ]

    if filename in implicit_extensions: return

    if not extension_names:
        print('missing extension names:', filename)
        # Abort with a failing status: the generated spec would be incomplete.
        sys.exit(1)

    prototypes = []
    vardecls = []

    # Variable declarations: "<modifiers> <type> gl_<name> [= value];"
    vardecl_regex = r'(' + modifier_regex + r'\s+)+\w+\s+gl_\w+(\s*=\s*\w+)?;'
    for section in find_matching_groups(groups, re.compile(r'chapter 7|section 7\.\d+', re.IGNORECASE)):
        section_text = '\n'.join(flatten(section))
        for match in re.finditer(vardecl_regex, section_text, re.MULTILINE):
            vardecls.append(parse_variable(match.group()))

    # Variable descriptions: paragraphs that talk about "variable gl_..." .
    pattern = re.compile('(' + '|'.join([
            r'variables?\s+<?gl_\w+>?',
            r'<?gl_\w+(\?\?\w+)?>?variables?\s+',
        ]) + ')', re.IGNORECASE)
    for section in find_matching_groups(groups, pattern):
        # Drop the trailing nested group and blank lines; only the heading
        # paragraphs are of interest.
        while isinstance(section[-1], list) or section[-1] == '':
            section = section[:-1]

        i = 0
        while i < len(section):
            if section[i] == '' or section[i].isspace():
                i += 1
                continue

            # Collect one paragraph: lines up to the next blank line.
            start = i
            while i < len(section) and section[i] != '': i += 1
            paragraph = '\n'.join(section[start:i])

            match = re.search(r'(variables?)?(,?(\s+and)?\s+<?gl_\w+(\?\?\w+)?>?)+(\s+(variables?|is\s+available))?', paragraph)
            if match is None: continue

            matches = re.findall(r'(gl_\w+(\?\?\w+)?)', match.group())
            if len(matches) == 0: i += 1; continue
            names = [found_name[0] for found_name in matches]

            # A '??' in the first name is a placeholder that expands to the
            # five comparison variants.
            if '??' in names[0]:
                variants = ['Eq', 'Ge', 'Gt', 'Le', 'Lt']
                names = [names[0].replace('??', v) for v in variants]

            for name in names:
                found = False
                for var in vardecls:
                    if var['name'] == name:
                        found = True
                        if 'description' not in var:
                            var['description'] = [paragraph]
                if not found:
                    print('unknown variable:', name)

    for section in find_matching_groups(groups, re.compile(r'chapter 8|section 8\.\d+', re.IGNORECASE)):
        for prototype in find_prototypes('\n'.join(flatten(section))):
            if prototype['name'].startswith('imageAtomic'): continue
            prototypes.append(prototype)

    # Function descriptions: paragraphs of the form "the function foo(...)".
    for section in find_matching_groups(groups, re.compile(r'.*(the\s+function\s+\w+\([^()]*\)).*', re.IGNORECASE)):
        while (isinstance(section[-1], list)
               or section[-1] == ''
               or re.match(r'^syntax:', section[-1].strip(), re.IGNORECASE) is not None):
            section = section[:-1]

        section_text = '\n'.join(section)
        for match in re.finditer(r'(the\s+function|and)\s+(\w+)\([^()]*\)', section_text, re.IGNORECASE | re.MULTILINE):
            name = match.group(2)
            found = False
            for proto in prototypes:
                if proto['name'] == name:
                    found = True
                    if 'description' not in proto:
                        proto['description'] = section_text.split('\n\n')
            if not found:
                print('unknown function:', name)

    for proto in prototypes + vardecls:
        required_extensions = extension_names

        if 'description' not in proto:
            print('missing documentation:', proto['name'])
        else:
            desc = proto['description']
            # A description may name the single extension that enables the
            # item, overriding the file-wide extension list.
            for item in desc:
                match = re.match(r'Only usable if the extension (\w+) is enabled', item, re.IGNORECASE)
                if match is not None:
                    required_extensions = [match.group(1)]

            proto['description'] = [escape_code(p) for p in desc]

        if len(required_extensions) == 0:
            print('missing extension:', proto['name'])
        else:
            proto['extensions'] = required_extensions

    functions.extend(prototypes)
    variables.extend(vardecls)


# Given a string, attempts to find and escape markdown code snippets
def escape_code(text):
    """Wrap code-like words of ``text`` in backticks.

    A word counts as code when it is written as ``<word>``, contains an
    underscore, or ends in a call such as ``name(args)``. Angle brackets
    around such a word are dropped; all other text is left untouched.
    """
    def wrap(match):
        word = match.group()
        looks_like_code = word[0] == '<' or '_' in word or word[-1] == ')'
        if not looks_like_code:
            return word
        return '`' + word.strip('<>') + '`'

    return re.sub(r'<?\w+(\([^()]*\))?>?', wrap, text)


# Path of the JSON file to write, taken from the first command-line argument.
output = sys.argv[1]

# Input files are looked up relative to the directory containing this script
# (or the current directory when the script path has no directory part).
scriptdir = os.path.dirname(sys.argv[0]) or '.'
extension_files = [f for f in iglob(f'{scriptdir}/GLSL/extensions/*/*.txt')]
docs_files = [f for f in iglob(f'{scriptdir}/docs.gl/sl4/*.xhtml')]
glsl_html_spec = f'{scriptdir}/GLSLangSpec.4.60.html'

# Progress counters: one unit of work per input file, plus one for the spec.
work = 0
total_work = len(extension_files) + len(docs_files) + 1

def progress(info):
    """Print a ``work/total_work: info`` line, then count one more unit of work."""
    global work
    print('{}/{}: {}'.format(work, total_work, info))
    work = work + 1

# Process every input in turn; each step reports progress before it runs.
progress(glsl_html_spec)
process_glsl_html_spec(glsl_html_spec)

for path in docs_files:
    progress(path)
    process_docs_gl_file(path)

for path in extension_files:
    progress(path)
    process_extension_file(path)

# Two variables are added by hand rather than parsed from the inputs.
variables.append({
    'modifiers': 'in',
    'type': 'int',
    'name': 'gl_VertexIndex',
    'description': [
        ' '.join("""The variable `gl_VertexIndex` is a vertex language input variable that
        holds an integer index for the vertex, relative to a base.  While the
        variable `gl_VertexIndex` is always present, its value is not always
        defined.""".split())
    ],
    'versions': [450],
})

variables.append({
    'modifiers': 'in',
    'type': 'int',
    'name': 'gl_InstanceIndex',
    'description': [
        ' '.join("""The variable `gl_InstanceIndex` is a vertex language input variable that
        holds the instance number of the current primitive in an instanced draw
        call, relative to a base. If the current primitive does not come from
        an instanced draw call, the value of `gl_InstanceIndex` is zero.""".split())
    ],
    'versions': [450],
})


# Sort every category by name so the generated file is stable across runs.
keywords.sort(key=lambda x: x['name'])
operators.sort(key=lambda x: x['name'])
types.sort(key=lambda x: x['name'])
variables.sort(key=lambda x: x['name'])
functions.sort(key=lambda x: x['name'])

# The JSON is written with ensure_ascii=False, so it can contain non-ASCII
# characters. Write it as UTF-8 explicitly instead of relying on the locale's
# default encoding, which may be unable to encode them.
with open(output, 'w', encoding='utf-8') as f:
    f.write(json.dumps({
        'comment': 'generated from docs.gl',
        'keywords': keywords,
        'operators': operators,
        'types': types,
        'variables': variables,
        'functions': functions,
    }, indent=2, ensure_ascii=False))

progress('done')
# No results found.