1
Fork 0
glsl_analyzer/spec/gen_spec.py
2023-12-06 21:04:53 +01:00

650 lines
21 KiB
Python
Executable file

#!/usr/bin/env python3
import sys
import os
from glob import iglob
import json
from bs4 import BeautifulSoup
import tokenize
import re
# Module-level accumulators for the generated spec. The process_* functions
# append one dict per entry to these lists; the script body at the bottom of
# the file sorts them by 'name' and dumps them to JSON.
keywords = []   # entries with 'name' and 'kind'
types = []      # entries with 'name' and 'description'
operators = []  # entries with 'name', 'precedence', 'left_to_right', 'kind'
variables = []  # built-in variable entries
functions = []  # function prototype entries
def process_glsl_html_spec(path):
    """Collect keywords, basic types and operators from the GLSL HTML spec.

    Parses the HTML document at ``path`` and appends one dict per entry to
    the module-level ``keywords``, ``types`` and ``operators`` lists.

    Raises:
        AssertionError: if the operator table does not have the expected
            column headers.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # Keywords: every <strong> inside the <dl> lists of the keywords section.
    # The lists are classified by position: the second is 'vulkan', the third
    # is 'reserved', and any other index is 'glsl'.
    kind_by_index = {1: 'vulkan', 2: 'reserved'}
    keywords_section = soup.find(id='keywords').parent
    for index, dl in enumerate(keywords_section.find_all('dl')):
        kind = kind_by_index.get(index, 'glsl')
        for strong in dl.find_all('strong'):
            keywords.append({
                'name': strong.getText(),
                'kind': kind,
            })

    # Basic types: only tables whose first two headers are Type / Meaning.
    basic_types_section = soup.find(id='basic-types').parent
    for table in basic_types_section.find_all('table'):
        headers = table.find_all('th')
        if headers[0].getText() != 'Type':
            continue
        if headers[1].getText() != 'Meaning':
            continue
        for row in table.find('tbody').find_all('tr'):
            name_cell, meaning_cell = row.find_all('td')
            # Collapse all whitespace runs in the description to one space.
            meaning = ' '.join(meaning_cell.getText().split())
            # A single cell may list several type names, one per line.
            for name in name_cell.getText().splitlines():
                types.append({
                    'name': name,
                    'description': [meaning],
                })

    # Operators: the first table of the operators section.
    operators_section = soup.find(id='operators').parent
    operator_table = operators_section.find('table')
    headers = operator_table.find_all('th')
    assert headers[0].getText() == 'Precedence'
    assert headers[1].getText() == 'Operator Class'
    assert headers[2].getText() == 'Operators'
    assert headers[3].getText() == 'Associativity'

    # Grouping / punctuation tokens that are not emitted as operators.
    ignored_operators = ['(', ')', '[', ']', '.', ',']
    for row in operator_table.find('tbody').find_all('tr'):
        precedence, operator_class, operator_words, associativity = row.find_all('td')
        precedence_number = int(precedence.getText().split()[0])
        left_to_right = associativity.getText() == 'Left to Right'
        class_text = operator_class.getText()

        # 'post fix' wins over 'prefix' when both appear in the class text.
        if 'post fix' in class_text:
            kind = 'postfix'
        elif 'prefix' in class_text:
            kind = 'prefix'
        else:
            kind = 'infix'

        for word in operator_words.getText().split():
            if word in ignored_operators:
                continue
            operators.append({
                'name': word,
                'precedence': precedence_number,
                'left_to_right': left_to_right,
                'kind': kind,
            })
def process_docs_gl_file(path):
    """Extract variable or function documentation from one docs.gl page.

    Files whose basename starts with ``gl_`` are treated as variable pages
    and contribute to the module-level ``variables`` list; all other pages
    contribute function prototypes to ``functions``. Pages without an element
    with id ``description`` are skipped.
    """
    is_variable = os.path.basename(path).startswith('gl_')

    # Decode explicitly as UTF-8 so non-ASCII text in the page is read the
    # same way regardless of the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    desc_node = soup.find(id='description')
    if desc_node is None:
        return
    description = [paragraph_to_markdown(p) for p in desc_node.find_all('p')]

    # The last header row of the versions table lists version numbers such as
    # "4.50"; they are stored as integers with the dot removed. The first
    # column of every row is skipped.
    versions_table = soup.find(id='versions')
    versions_header = versions_table.find('thead').find_all('tr')[-1]
    versions = [
        int(th.getText().replace('.', ''))
        for th in versions_header.find_all('th')[1:]
    ]

    # One list of supported versions per body row; a '-' cell means the
    # version is not supported.
    version_support = []
    for row in versions_table.find('tbody').find_all('tr'):
        cells = row.find_all('td')
        version_support.append([
            version
            for version, cell in zip(versions, cells[1:])
            if cell.getText().strip() != '-'
        ])

    if is_variable:
        for node in soup.find_all(attrs={'class': 'fieldsynopsis'}):
            variable = parse_variable(node.getText())
            variable['description'] = description
            variable['versions'] = version_support[0]
            variables.append(variable)

        # For any path containing 'gl_Position' an explicit vec4 entry is
        # added on top of whatever the synopsis nodes produced.
        if 'gl_Position' in path:
            variable = {'name': 'gl_Position', 'type': 'vec4'}
            variable['description'] = description
            variable['versions'] = version_support[0]
            variables.append(variable)
    else:
        prototype_nodes = soup.find_all(attrs={'class': 'funcprototype-table'})
        for i, node in enumerate(prototype_nodes):
            prototype = parse_prototype(node.getText())
            prototype['description'] = description
            # Prototypes beyond the number of version rows reuse the last row.
            prototype['versions'] = version_support[min(i, len(version_support) - 1)]
            functions.append(prototype)
def paragraph_to_markdown(paragraph):
    """Render a parsed ``<p>`` element as a single markdown string.

    A paragraph whose first ``<math>`` element contains an ``<mtable>`` is
    returned as a fenced code block. Otherwise ``<em>``, ``<code>`` and
    ``<math>`` elements are replaced in place (mutating ``paragraph``) by
    their markdown spelling, and the text is returned with every whitespace
    run collapsed to one space.
    """
    math = paragraph.math
    if math is not None and math.mtable is not None:
        table_text = expand_math(math.mtable).replace('δ ', 'δ')
        return '```\n' + table_text + '\n```\n'

    # Emphasis becomes _text_.
    for em in paragraph.find_all('em'):
        em.replace_with(f'_{em.getText()}_')

    # Inline code becomes `text`.
    for code in paragraph.find_all('code'):
        code.replace_with(f'`{code.getText()}`')

    # Inline math is linearised and also wrapped in backticks.
    for inline_math in paragraph.find_all('math'):
        inline_math.replace_with(f'`{escape_math(inline_math)}`')

    words = paragraph.getText().split()
    return ' '.join(words)
def math_children(node):
    """Return the children of ``node`` whose ``name`` is not ``None``.

    Children with ``name is None`` are dropped, so positional access such as
    ``parts[0]`` / ``parts[1]`` only sees named children.
    """
    return [child for child in node.children if child.name is not None]
def escape_math(node):
    """Linearise a math node into one-line text suitable for inline code.

    Runs of spaces, tabs and carriage returns produced by ``expand_math`` are
    collapsed to a single space (newlines are left alone), and a ``δ``
    followed by a space is joined to what follows.
    """
    # str.split(" \t\r") would split on that literal three-character string,
    # not on any of the characters, so a character class is used instead.
    collapsed = re.sub(r'[ \t\r]+', ' ', expand_math(node))
    return collapsed.replace('δ ', 'δ')
def expand_math(node):
    """Recursively convert a MathML node into plain text.

    Handles text nodes and the elements mi, mn, mo, mrow, math, mfrac,
    mfenced, msup, msub, msubsup, mtable and msqrt.

    Raises:
        Exception: for any other element name.
    """
    name = node.name

    # Text nodes and token elements: just their stripped text.
    if name is None or name in ('mi', 'mn', 'mo'):
        text = node.getText().strip()
        # U+2061 is dropped so it never shows up in the output.
        return '' if text == '\u2061' else text

    if name in ('mrow', 'math'):
        return ' '.join(expand_math(child) for child in node.children)

    if name == 'mfrac':
        parts = math_children(node)
        return f'{expand_math(parts[0])} / {expand_math(parts[1])}'

    if name == 'mfenced':
        # MathML defaults the delimiters to '(' and ')' when the attributes
        # are omitted; fall back to those instead of raising KeyError.
        opening = node.get('open', '(')
        closing = node.get('close', ')')
        inner = ' '.join(expand_math(child) for child in node.children).strip()
        return opening + inner + closing

    if name == 'msup':
        parts = math_children(node)
        return expand_math(parts[0]) + '**' + expand_math(parts[1])

    if name == 'msub':
        parts = math_children(node)
        return expand_math(parts[0]) + '_' + expand_math(parts[1])

    if name == 'msubsup':
        parts = math_children(node)
        return (expand_math(parts[0])
                + '_' + expand_math(parts[1])
                + '^' + expand_math(parts[2]))

    if name == 'mtable':
        # One output line per <mtr>; every <mtd> cell is followed by a space.
        lines = []
        for row in node.find_all('mtr'):
            cells = [
                ' '.join(expand_math(child) for child in col.children)
                for col in row.find_all('mtd')
            ]
            lines.append(''.join(cell + ' ' for cell in cells) + '\n')
        return ''.join(lines)

    if name == 'msqrt':
        inner = ' '.join(expand_math(child) for child in math_children(node))
        return f'sqrt({inner})'

    raise Exception(f'unknown math node {node.name}: {node}')
# Qualifier keywords that parse_variable and parse_prototype accept in front
# of a type.
modifier_keywords = ['out', 'in', 'inout', 'const', 'highp', 'lowp', 'perprimitiveEXT']
# The same keywords as one regex alternation group; used when searching
# extension text for variable declarations.
modifier_regex = '(' + '|'.join(modifier_keywords) + ')'
# Pattern for type names: void/int/float/double/bool, or any word containing
# 'vec', 'mat' or 'gen...Type'. Used to check parsed return types.
type_regex = r'(void|int|float|double|bool|\w*(vec|mat|gen\w*Type)\w*)'
def parse_variable(text):
    """Parse a declaration like ``in vec4 gl_Foo[4] = value;`` into a dict.

    The result has ``type`` and ``name``, preceded by ``modifiers`` (a
    space-separated string) when qualifiers are present, and followed by
    ``default_value`` when an initialiser is present. An array suffix is
    appended to ``type``.

    Raises:
        AssertionError: if the declaration is not terminated by ``;``.
        IndexError: if the tokens run out before the declaration is complete.
    """
    tokens = tokenize(text)
    variable = {}
    pos = 0

    # Leading qualifiers. 'modifiers' is inserted first so it stays the first
    # key of the resulting dict.
    qualifiers = []
    while tokens[pos] in modifier_keywords:
        qualifiers.append(tokens[pos])
        pos += 1
    if qualifiers:
        variable['modifiers'] = ' '.join(qualifiers)

    variable['type'] = tokens[pos]
    variable['name'] = tokens[pos + 1]
    pos += 2

    # Optional array suffix: everything from '[' through ']' joins the type.
    if tokens[pos] == '[':
        bracket = pos
        while tokens[pos] != ']':
            pos += 1
        pos += 1
        variable['type'] += ''.join(tokens[bracket:pos])

    # Optional initialiser: everything between '=' and ';'.
    if tokens[pos] == '=':
        pos += 1
        value_start = pos
        while tokens[pos] != ';':
            pos += 1
        variable['default_value'] = ' '.join(tokens[value_start:pos])

    assert tokens[pos] == ';'
    return variable
def parse_prototype(text):
    """Parse a function prototype into its return type, name and parameters.

    Returns a dict with ``return_type``, ``name`` and ``parameters``. Each
    parameter is a dict that may contain ``optional``, ``modifiers``,
    ``type`` and ``name`` (in that order).

    Raises:
        AssertionError: if the tokens do not end in ``)`` or ``);``, or an
            optional parameter is not closed by ``]``.
    """
    tokens = tokenize(text)
    count = len(tokens)

    # The two tokens right before '(' are the return type and the name; any
    # earlier tokens are shifted out.
    return_type, name = tokens[0], tokens[1]
    pos = 2
    while pos < count and tokens[pos] != '(':
        return_type, name = name, tokens[pos]
        pos += 1
    pos += 1  # step past the '(' (or past the end when none was found)

    parameters = []
    while pos < count - 2:
        param = {}

        # '[' in front of a parameter marks it as optional.
        is_optional = tokens[pos] == '['
        if is_optional:
            pos += 1
            param['optional'] = True

        qualifiers = []
        while tokens[pos] in modifier_keywords:
            qualifiers.append(tokens[pos])
            pos += 1
        if qualifiers:
            param['modifiers'] = ' '.join(qualifiers)

        # A 'void' parameter list ends parsing without adding a parameter.
        if tokens[pos] == 'void':
            break
        param['type'] = tokens[pos]
        pos += 1

        # The name is optional; punctuation here means there is none.
        if tokens[pos][0].isalnum():
            param['name'] = tokens[pos]
            pos += 1

        # Array suffix: '[' through ']' is appended to the type.
        if tokens[pos] == '[':
            bracket = pos
            while pos < count and tokens[pos] != ']':
                pos += 1
            pos += 1
            param['type'] += ''.join(tokens[bracket:pos])

        parameters.append(param)

        if is_optional:
            assert tokens[pos] == ']'
            pos += 1
        if tokens[pos] == ',':
            pos += 1

    assert tokens[-1] == ')' or (tokens[-2] == ')' and tokens[-1] == ';')

    return {
        'return_type': return_type,
        'name': name,
        'parameters': parameters,
    }
def tokenize(text):
    """Split ``text`` into word tokens and single-character tokens.

    A word starts with an alphanumeric character and continues over
    alphanumerics and underscores. Whitespace is skipped; every other
    character becomes a token of its own.
    """
    tokens = []
    pos, end = 0, len(text)
    while pos < end:
        ch = text[pos]
        if ch.isspace():
            pos += 1
        elif ch.isalnum():
            stop = pos + 1
            while stop < end and (text[stop] == '_' or text[stop].isalnum()):
                stop += 1
            tokens.append(text[pos:stop])
            pos = stop
        else:
            tokens.append(ch)
            pos += 1
    return tokens
def group_by_indent(text: str, tab_indent=None):
    """Group the lines of ``text`` into nested lists by indentation.

    Lines at the current indentation level are appended as stripped strings;
    a deeper-indented run of lines becomes a nested list placed right after
    the line that precedes it. Blank lines are kept as ``''`` at the current
    level.

    Args:
        text: the text to group.
        tab_indent: number of columns a tab counts for; when falsy, the
            current indentation step (4, or 2 once detected) is used.

    Returns:
        A list whose items are strings (lines) or nested lists (groups).
    """
    stack = [[]]
    indents = [0]
    indentation = 4

    for raw_line in text.splitlines():
        # Strip by character count, but measure the indent in columns: a tab
        # is one character yet counts for several columns, so slicing the
        # line by the column width would cut into the content.
        content = raw_line.lstrip(' \t')
        width = 0
        for ch in raw_line[:len(raw_line) - len(content)]:
            width += 1 if ch == ' ' else (tab_indent or indentation)

        # An indent of 2 modulo 4 switches to 2-column steps from here on.
        if width % 4 == 2:
            indentation = 2

        # Round to nearest multiple of indentation to account for minor errors
        width = int(round(width / indentation) * indentation)

        if not content:
            stack[-1].append('')
            continue

        # Close every group that is indented deeper than this line.
        while width < indents[-1]:
            closed = stack.pop()
            stack[-1].append(closed)
            indents.pop()

        if width == indents[-1]:
            stack[-1].append(content)
        else:
            # Deeper than the current level: open a new nested group.
            stack.append([content])
            indents.append(width)

    # Fold any groups still open into their parents.
    while len(stack) > 1:
        closed = stack.pop()
        stack[-1].append(closed)

    return stack[0]
def find_matching_groups(groups, pattern, flags=0):
    """Yield slices of ``groups`` whose leading text lines match ``pattern``.

    Consecutive string items are accumulated (newline-joined) and searched
    with ``re.search``. Once they match, the slice from the start of that run
    of strings up to and including the next nested list is yielded; if no
    list follows, the slice runs to the end of ``groups``. Nested lists that
    do not follow a match are searched recursively.
    """
    pending_start = None  # start of a matched run awaiting its nested list
    run_start = 0         # index just after the most recent nested list
    buffer = ''           # text accumulated since the last nested list

    for index, entry in enumerate(groups):
        if not isinstance(entry, str):
            buffer = ''
            run_start = index + 1
            if pending_start is None:
                yield from find_matching_groups(entry, pattern, flags)
            else:
                yield groups[pending_start:index + 1]
                pending_start = None
            continue

        buffer += '\n' + entry
        if re.search(pattern, buffer, flags) is not None:
            pending_start = run_start

    if pending_start is not None:
        yield groups[pending_start:]
def flatten(xs):
    """Yield the non-list items of ``xs`` depth-first, descending into lists."""
    for item in xs:
        if not isinstance(item, list):
            yield item
            continue
        yield from flatten(item)
def find_prototypes(text):
    """Yield function prototypes found in ``text``.

    ASCII tables (``+---+`` borders, ``|`` separated cells) with a
    syntax/function column and a description column are scanned first, so
    prototypes found there carry a ``description``. The whole text is then
    scanned as a fallback, without descriptions. Each function name is
    reported at most once, and only prototypes whose return type matches
    ``type_regex`` are yielded; unexpected return types are printed.
    """
    syntaxes = []
    descriptions = []

    table_pattern = r'(\s*(\+(-*\+)+|\|([^|]*\|)+))+'
    for table in re.finditer(table_pattern, text, re.MULTILINE):
        in_header = True
        for row_index, row in enumerate(table.group().strip().splitlines()):
            row = row.strip()

            # A border line after the first one starts a new table entry.
            if row[0] == '+':
                if row_index != 0:
                    syntaxes.append('')
                    descriptions.append('')
                continue

            # Only two-column rows ("| a | b |" splits into 4 pieces) count.
            cells = row.split('|')
            if len(cells) != 4:
                break

            if not in_header:
                syntaxes[-1] += cells[1]
                descriptions[-1] += cells[2] + '\n'
                continue

            # The header row must name the expected columns.
            if re.match(r'\s*(syntax|function)\s*', cells[1], re.IGNORECASE) is None:
                break
            if re.match(r'\s*description\s*', cells[2], re.IGNORECASE) is None:
                break
            in_header = False

    # Normalise whitespace in the syntax cells and split the descriptions
    # into paragraphs.
    syntaxes = [' '.join(syntax.strip().split()) for syntax in syntaxes]
    descriptions = [description.strip().split('\n\n') for description in descriptions]

    # Fallback: the full text, with no description attached.
    syntaxes.append(text)
    descriptions.append(None)

    seen = set()
    prototype_pattern = r'((\w+\s+)+\w+\([^()]*\))'
    for syntax, description in zip(syntaxes, descriptions):
        for candidate in re.finditer(prototype_pattern, syntax, re.MULTILINE):
            match_text = candidate.group(1)
            if '|' in match_text:
                continue

            proto = parse_prototype(match_text)
            if proto['name'] in seen:
                continue
            seen.add(proto['name'])

            if description is not None:
                proto['description'] = description

            return_type = proto['return_type']
            if re.search(type_regex, return_type) is not None:
                yield proto
                continue

            # Ordinary English words in front of "name(...)" are expected in
            # prose; anything else is reported.
            known_invalid_pattern = r'(functions?|of|enable|the|and|to|call|with|if|as|in)'
            if re.match(known_invalid_pattern, return_type, re.IGNORECASE) is None:
                print('unknown return type:', match_text)
def process_extension_file(path):
    """Extract variables and functions from one GLSL extension text file.

    Variable declarations are searched in the parts of the document that
    mention chapter/section 7 and prototypes in those mentioning
    chapter/section 8; descriptions are attached from surrounding prose.
    Results are appended to the module-level ``variables`` and ``functions``
    lists, each tagged with the extension names that enable it.

    Files listed in ``implicit_extensions`` are skipped. A file that declares
    no ``#extension`` name and is not in that list ends the program.
    """
    filename = os.path.basename(path)
    # Decode explicitly so the result does not depend on the locale.
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    # The basename includes the '.txt' suffix, so compare the stem; both the
    # GL_ and GLSL_ filename prefixes are accepted.
    # NOTE(review): confirm which prefix the file actually uses.
    stem = os.path.splitext(filename)[0]
    tab_indent = 4
    if stem in ('GL_HUAWEI_cluster_culling_shader', 'GLSL_HUAWEI_cluster_culling_shader'):
        tab_indent = 8
    groups = group_by_indent(text, tab_indent)

    extension_names = [
        match.group(1)
        for match in re.finditer(r'\s+#extension (\w+)\s*:\s*<\w+>', text)
    ]

    implicit_extensions = [
        'GL_KHR_vulkan_glsl.txt',
        'GL_EXT_vulkan_glsl_relaxed.txt',
        'GLSL_EXT_shader_subgroup_extended_types.txt',
    ]

    if len(extension_names) == 0:
        if filename not in implicit_extensions:
            # Report the file being processed; the exit status is kept at 0.
            print('missing extension names:', filename)
            sys.exit(0)

    if filename in implicit_extensions:
        return

    prototypes = []
    vardecls = []

    # --- Variable declarations (chapter 7) ---
    chapter7 = re.compile(r'chapter 7|section 7\.\d+', re.IGNORECASE)
    vardecl_pattern = r'(' + modifier_regex + r'\s+)+\w+\s+gl_\w+(\s*=\s*\w+)?;'
    for section in find_matching_groups(groups, chapter7):
        section_text = '\n'.join(flatten(section))
        for match in re.finditer(vardecl_pattern, section_text, re.MULTILINE):
            vardecls.append(parse_variable(match.group()))

    # --- Variable descriptions ---
    pattern = re.compile('(' + '|'.join([
        r'variables?\s+<?gl_\w+>?',
        r'<?gl_\w+(\?\?\w+)?>?variables?\s+',
    ]) + ')', re.IGNORECASE)
    mention_pattern = (
        r'(variables?)?(,?(\s+and)?\s+<?gl_\w+(\?\?\w+)?>?)+'
        r'(\s+(variables?|is\s+available))?'
    )
    for section in find_matching_groups(groups, pattern):
        # Drop trailing nested groups and blank lines.
        while isinstance(section[-1], list) or section[-1] == '':
            section = section[:-1]

        i = 0
        while i < len(section):
            if section[i] == '' or section[i].isspace():
                i += 1
                continue

            # Gather one paragraph: lines up to the next empty line.
            start = i
            while i < len(section) and section[i] != '':
                i += 1
            paragraph = '\n'.join(section[start:i])

            match = re.search(mention_pattern, paragraph)
            if match is None:
                continue

            matches = re.findall(r'(gl_\w+(\?\?\w+)?)', match.group())
            if len(matches) == 0:
                i += 1
                continue

            names = [found_name[0] for found_name in matches]
            # A '??' placeholder stands for the comparison variants.
            if '??' in names[0]:
                variants = ['Eq', 'Ge', 'Gt', 'Le', 'Lt']
                names = [names[0].replace('??', v) for v in variants]

            for name in names:
                found = False
                for var in vardecls:
                    if var['name'] == name:
                        found = True
                        if 'description' not in var:
                            var['description'] = [paragraph]
                if not found:
                    print('unknown variable:', name)

    # --- Function prototypes (chapter 8) ---
    chapter8 = re.compile(r'chapter 8|section 8\.\d+', re.IGNORECASE)
    for section in find_matching_groups(groups, chapter8):
        for prototype in find_prototypes('\n'.join(flatten(section))):
            if prototype['name'].startswith('imageAtomic'):
                continue
            prototypes.append(prototype)

    # --- Function descriptions from prose ("the function foo(...)") ---
    function_prose = re.compile(r'.*(the\s+function\s+\w+\([^()]*\)).*', re.IGNORECASE)
    for section in find_matching_groups(groups, function_prose):
        # Drop trailing nested groups, blank lines and "syntax:" lines.
        while (isinstance(section[-1], list)
               or section[-1] == ''
               or re.match(r'^syntax:', section[-1].strip(), re.IGNORECASE) is not None):
            section = section[:-1]

        prose = '\n'.join(section)
        for match in re.finditer(r'(the\s+function|and)\s+(\w+)\([^()]*\)', prose,
                                 re.IGNORECASE | re.MULTILINE):
            name = match.group(2)
            found = False
            for proto in prototypes:
                if proto['name'] == name:
                    found = True
                    if 'description' not in proto:
                        proto['description'] = prose.split('\n\n')
            if not found:
                print('unknown function:', name)

    # --- Attach extensions and escape code snippets in descriptions ---
    for proto in prototypes + vardecls:
        required_extensions = extension_names
        if 'description' not in proto:
            print('missing documentation:', proto['name'])
        else:
            desc = proto['description']
            # A description can name the one extension that enables the entry.
            for item in desc:
                match = re.match(r'Only usable if the extension (\w+) is enabled',
                                 item, re.IGNORECASE)
                if match is not None:
                    required_extensions = [match.group(1)]
            proto['description'] = [escape_code(p) for p in desc]

        if filename not in implicit_extensions:
            if len(required_extensions) == 0:
                print('missing extension:', proto['name'])
            else:
                proto['extensions'] = required_extensions

    functions.extend(prototypes)
    variables.extend(vardecls)
def escape_code(text):
    """Wrap code-looking words of ``text`` in markdown backticks.

    A word (optionally wrapped in ``<``/``>`` and optionally followed by a
    parenthesised argument list) is treated as code when it starts with
    ``<``, contains an underscore, or ends with ``)``. Angle brackets are
    stripped from the escaped form; everything else is left unchanged.
    """
    def quote(match):
        word = match.group()
        looks_like_code = word[0] == '<' or '_' in word or word[-1] == ')'
        if not looks_like_code:
            return word
        return '`' + word.strip('<>') + '`'

    return re.sub(r'<?\w+(\([^()]*\))?>?', quote, text)
# The only command-line argument is the path of the file to generate.
if len(sys.argv) < 2:
    sys.exit(f'usage: {sys.argv[0]} <output>')
output = sys.argv[1]

# Input files are looked up relative to the directory containing this script.
scriptdir = os.path.dirname(sys.argv[0]) or '.'

# glob yields paths in a filesystem-dependent order; sort them so the files
# are always processed in the same order and the output is reproducible.
extension_files = sorted(iglob(f'{scriptdir}/GLSL/extensions/*/*.txt'))
docs_files = sorted(iglob(f'{scriptdir}/docs.gl/sl4/*.xhtml'))
glsl_html_spec = f'{scriptdir}/GLSLangSpec.4.60.html'

# Progress counters: one unit of work per input file.
work = 0
total_work = len(extension_files) + len(docs_files) + 1
def progress(info):
    """Print ``<work>/<total_work>: <info>`` and advance the work counter."""
    global work
    line = f'{work}/{total_work}: {info}'
    print(line)
    work = work + 1
# Process every input, reporting progress before each file.
progress(glsl_html_spec)
process_glsl_html_spec(glsl_html_spec)

for path in docs_files:
    progress(path)
    process_docs_gl_file(path)

for path in extension_files:
    progress(path)
    process_extension_file(path)

# Two variables are added by hand with fixed descriptions.
variables.append({
    'modifiers': 'in',
    'type': 'int',
    'name': 'gl_VertexIndex',
    'description': [
        ' '.join("""The variable `gl_VertexIndex` is a vertex language input variable that
        holds an integer index for the vertex, relative to a base. While the
        variable `gl_VertexIndex` is always present, its value is not always
        defined.""".split())
    ],
    'versions': [450],
})

variables.append({
    'modifiers': 'in',
    'type': 'int',
    'name': 'gl_InstanceIndex',
    'description': [
        ' '.join("""The variable `gl_InstanceIndex` is a vertex language input variable that
        holds the instance number of the current primitive in an instanced draw
        call, relative to a base. If the current primitive does not come from
        an instanced draw call, the value of `gl_InstanceIndex` is zero.""".split())
    ],
    'versions': [450],
})

# Sort every category by name before writing.
keywords.sort(key=lambda x: x['name'])
operators.sort(key=lambda x: x['name'])
types.sort(key=lambda x: x['name'])
variables.sort(key=lambda x: x['name'])
functions.sort(key=lambda x: x['name'])

# ensure_ascii=False emits non-ASCII characters verbatim, so the file must be
# written as UTF-8 explicitly rather than in the platform's default encoding.
with open(output, 'w', encoding='utf-8') as f:
    f.write(json.dumps({
        'comment': 'generated from docs.gl',
        'keywords': keywords,
        'operators': operators,
        'types': types,
        'variables': variables,
        'functions': functions,
    }, indent=2, ensure_ascii=False))

progress('done')