// Copyright 2014 The Flutter Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // The whole design for the lexing and parsing step can be found in this design doc. // See https://flutter.dev/go/icu-message-parser. // Symbol Types import '../base/logger.dart'; import 'gen_l10n_types.dart'; enum ST { // Terminal Types openBrace, closeBrace, comma, equalSign, other, plural, select, string, number, identifier, empty, // Nonterminal Types message, placeholderExpr, pluralExpr, pluralParts, pluralPart, selectExpr, selectParts, selectPart, } // The grammar of the syntax. Map<ST, List<List<ST>>> grammar = <ST, List<List<ST>>>{ ST.message: <List<ST>>[ <ST>[ST.string, ST.message], <ST>[ST.placeholderExpr, ST.message], <ST>[ST.pluralExpr, ST.message], <ST>[ST.selectExpr, ST.message], <ST>[ST.empty], ], ST.placeholderExpr: <List<ST>>[ <ST>[ST.openBrace, ST.identifier, ST.closeBrace], ], ST.pluralExpr: <List<ST>>[ <ST>[ST.openBrace, ST.identifier, ST.comma, ST.plural, ST.comma, ST.pluralParts, ST.closeBrace], ], ST.pluralParts: <List<ST>>[ <ST>[ST.pluralPart, ST.pluralParts], <ST>[ST.empty], ], ST.pluralPart: <List<ST>>[ <ST>[ST.identifier, ST.openBrace, ST.message, ST.closeBrace], <ST>[ST.equalSign, ST.number, ST.openBrace, ST.message, ST.closeBrace], <ST>[ST.other, ST.openBrace, ST.message, ST.closeBrace], ], ST.selectExpr: <List<ST>>[ <ST>[ST.openBrace, ST.identifier, ST.comma, ST.select, ST.comma, ST.selectParts, ST.closeBrace], <ST>[ST.other, ST.openBrace, ST.message, ST.closeBrace], ], ST.selectParts: <List<ST>>[ <ST>[ST.selectPart, ST.selectParts], <ST>[ST.empty], ], ST.selectPart: <List<ST>>[ <ST>[ST.identifier, ST.openBrace, ST.message, ST.closeBrace], <ST>[ST.number, ST.openBrace, ST.message, ST.closeBrace], <ST>[ST.other, ST.openBrace, ST.message, ST.closeBrace], ], }; class Node { Node(this.type, this.positionInMessage, { this.expectedSymbolCount = 0, this.value, List<Node>? children }): children = children ?? <Node>[]; // Token constructors. Node.openBrace(this.positionInMessage): type = ST.openBrace, value = '{'; Node.closeBrace(this.positionInMessage): type = ST.closeBrace, value = '}'; Node.brace(this.positionInMessage, String this.value) { if (value == '{') { type = ST.openBrace; } else if (value == '}') { type = ST.closeBrace; } else { // We should never arrive here. throw L10nException('Provided value $value is not a brace.'); } } Node.equalSign(this.positionInMessage): type = ST.equalSign, value = '='; Node.comma(this.positionInMessage): type = ST.comma, value = ','; Node.string(this.positionInMessage, String this.value): type = ST.string; Node.number(this.positionInMessage, String this.value): type = ST.number; Node.identifier(this.positionInMessage, String this.value): type = ST.identifier; Node.pluralKeyword(this.positionInMessage): type = ST.plural, value = 'plural'; Node.selectKeyword(this.positionInMessage): type = ST.select, value = 'select'; Node.otherKeyword(this.positionInMessage): type = ST.other, value = 'other'; Node.empty(this.positionInMessage): type = ST.empty, value = ''; String? value; late ST type; List<Node> children = <Node>[]; int positionInMessage; int expectedSymbolCount = 0; @override String toString() { return _toStringHelper(0); } String _toStringHelper(int indentLevel) { final String indent = List<String>.filled(indentLevel, ' ').join(); if (children.isEmpty) { return ''' ${indent}Node($type, $positionInMessage${value == null ? '' : ", value: '$value'"})'''; } final String childrenString = children.map((Node child) => child._toStringHelper(indentLevel + 1)).join(',\n'); return ''' ${indent}Node($type, $positionInMessage${value == null ? '' : ", value: '$value'"}, children: <Node>[ $childrenString, $indent])'''; } // Only used for testing. We don't compare expectedSymbolCount because // it is an auxiliary member used during the parse function but doesn't // have meaning after calling compress. @override // ignore: avoid_equals_and_hash_code_on_mutable_classes, hash_and_equals bool operator==(covariant Node other) { if(value != other.value || type != other.type || positionInMessage != other.positionInMessage || children.length != other.children.length ) { return false; } for (int i = 0; i < children.length; i++) { if (children[i] != other.children[i]) { return false; } } return true; } bool get isFull { return children.length >= expectedSymbolCount; } } RegExp escapedString = RegExp(r"'[^']*'"); RegExp unescapedString = RegExp(r"[^{}']+"); RegExp normalString = RegExp(r'[^{}]+'); RegExp brace = RegExp(r'{|}'); RegExp whitespace = RegExp(r'\s+'); RegExp numeric = RegExp(r'[0-9]+'); RegExp alphanumeric = RegExp(r'[a-zA-Z0-9|_]+'); RegExp comma = RegExp(r','); RegExp equalSign = RegExp(r'='); // List of token matchers ordered by precedence Map<ST, RegExp> matchers = <ST, RegExp>{ ST.empty: whitespace, ST.number: numeric, ST.comma: comma, ST.equalSign: equalSign, ST.identifier: alphanumeric, }; class Parser { Parser( this.messageId, this.filename, this.messageString, { this.useEscaping = false, this.logger } ); final String messageId; final String messageString; final String filename; final bool useEscaping; final Logger? logger; static String indentForError(int position) { return '${List<String>.filled(position, ' ').join()}^'; } // Lexes the message into a list of typed tokens. General idea is that // every instance of "{" and "}" toggles the isString boolean and every // instance of "'" toggles the isEscaped boolean (and treats a double // single quote "''" as a single quote "'"). When !isString and !isEscaped // delimit tokens by whitespace and special characters. List<Node> lexIntoTokens() { final List<Node> tokens = <Node>[]; bool isString = true; // Index specifying where to match from int startIndex = 0; // At every iteration, we should be able to match a new token until we // reach the end of the string. If for some reason we don't match a // token in any iteration of the loop, throw an error. while (startIndex < messageString.length) { Match? match; if (isString) { if (useEscaping) { // This case is slightly involved. Essentially, wrapping any syntax in // single quotes escapes the syntax except when there are consecutive pair of single // quotes. For example, "Hello! 'Flutter''s amazing'. { unescapedPlaceholder }" // converts the '' in "Flutter's" to a single quote for convenience, since technically, // we should interpret this as two strings 'Flutter' and 's amazing'. To get around this, // we also check if the previous character is a ', and if so, add a single quote at the beginning // of the token. match = escapedString.matchAsPrefix(messageString, startIndex); if (match != null) { final String string = match.group(0)!; if (string == "''") { tokens.add(Node.string(startIndex, "'")); } else if (startIndex > 1 && messageString[startIndex - 1] == "'") { // Include a single quote in the beginning of the token. tokens.add(Node.string(startIndex, string.substring(0, string.length - 1))); } else { tokens.add(Node.string(startIndex, string.substring(1, string.length - 1))); } startIndex = match.end; continue; } match = unescapedString.matchAsPrefix(messageString, startIndex); if (match != null) { tokens.add(Node.string(startIndex, match.group(0)!)); startIndex = match.end; continue; } } else { match = normalString.matchAsPrefix(messageString, startIndex); if (match != null) { tokens.add(Node.string(startIndex, match.group(0)!)); startIndex = match.end; continue; } } match = brace.matchAsPrefix(messageString, startIndex); if (match != null) { tokens.add(Node.brace(startIndex, match.group(0)!)); isString = false; startIndex = match.end; continue; } // Theoretically, we only reach this point because of unmatched single quotes because // 1. If it begins with single quotes, then we match the longest string contained in single quotes. // 2. If it begins with braces, then we match those braces. // 3. Else the first character is neither single quote or brace so it is matched by RegExp "unescapedString" throw L10nParserException( 'ICU Lexing Error: Unmatched single quotes.', filename, messageId, messageString, startIndex, ); } else { RegExp matcher; ST? matchedType; // Try to match tokens until we succeed for (matchedType in matchers.keys) { matcher = matchers[matchedType]!; match = matcher.matchAsPrefix(messageString, startIndex); if (match != null) { break; } } if (match == null) { match = brace.matchAsPrefix(messageString, startIndex); if (match != null) { tokens.add(Node.brace(startIndex, match.group(0)!)); isString = true; startIndex = match.end; continue; } // This should only happen when there are special characters we are unable to match. throw L10nParserException( 'ICU Lexing Error: Unexpected character.', filename, messageId, messageString, startIndex ); } else if (matchedType == ST.empty) { // Do not add whitespace as a token. startIndex = match.end; continue; } else if (<ST>[ST.identifier].contains(matchedType) && tokens.last.type == ST.openBrace) { // Treat any token as identifier if it comes right after an open brace, whether it's a keyword or not. tokens.add(Node(ST.identifier, startIndex, value: match.group(0))); startIndex = match.end; continue; } else { // Handle keywords separately. Otherwise, lexer will assume parts of identifiers may be keywords. final String tokenStr = match.group(0)!; switch(tokenStr) { case 'plural': matchedType = ST.plural; break; case 'select': matchedType = ST.select; break; case 'other': matchedType = ST.other; break; } tokens.add(Node(matchedType!, startIndex, value: match.group(0))); startIndex = match.end; continue; } } } return tokens; } Node parseIntoTree() { final List<Node> tokens = lexIntoTokens(); final List<ST> parsingStack = <ST>[ST.message]; final Node syntaxTree = Node(ST.empty, 0, expectedSymbolCount: 1); final List<Node> treeTraversalStack = <Node>[syntaxTree]; // Helper function for parsing and constructing tree. void parseAndConstructNode(ST nonterminal, int ruleIndex) { final Node parent = treeTraversalStack.last; final List<ST> grammarRule = grammar[nonterminal]![ruleIndex]; // When we run out of tokens, just use -1 to represent the last index. final int positionInMessage = tokens.isNotEmpty ? tokens.first.positionInMessage : -1; final Node node = Node(nonterminal, positionInMessage, expectedSymbolCount: grammarRule.length); parsingStack.addAll(grammarRule.reversed); // For tree construction, add nodes to the parent until the parent has all // the children it is expecting. parent.children.add(node); if (parent.isFull) { treeTraversalStack.removeLast(); } treeTraversalStack.add(node); } while (parsingStack.isNotEmpty) { final ST symbol = parsingStack.removeLast(); // Figure out which production rule to use. switch(symbol) { case ST.message: if (tokens.isEmpty) { parseAndConstructNode(ST.message, 4); } else if (tokens[0].type == ST.closeBrace) { parseAndConstructNode(ST.message, 4); } else if (tokens[0].type == ST.string) { parseAndConstructNode(ST.message, 0); } else if (tokens[0].type == ST.openBrace) { if (3 < tokens.length && tokens[3].type == ST.plural) { parseAndConstructNode(ST.message, 2); } else if (3 < tokens.length && tokens[3].type == ST.select) { parseAndConstructNode(ST.message, 3); } else { parseAndConstructNode(ST.message, 1); } } else { // Theoretically, we can never get here. throw L10nException('ICU Syntax Error.'); } break; case ST.placeholderExpr: parseAndConstructNode(ST.placeholderExpr, 0); break; case ST.pluralExpr: parseAndConstructNode(ST.pluralExpr, 0); break; case ST.pluralParts: if (tokens.isNotEmpty && ( tokens[0].type == ST.identifier || tokens[0].type == ST.other || tokens[0].type == ST.equalSign ) ) { parseAndConstructNode(ST.pluralParts, 0); } else { parseAndConstructNode(ST.pluralParts, 1); } break; case ST.pluralPart: if (tokens.isNotEmpty && tokens[0].type == ST.identifier) { parseAndConstructNode(ST.pluralPart, 0); } else if (tokens.isNotEmpty && tokens[0].type == ST.equalSign) { parseAndConstructNode(ST.pluralPart, 1); } else if (tokens.isNotEmpty && tokens[0].type == ST.other) { parseAndConstructNode(ST.pluralPart, 2); } else { throw L10nParserException( 'ICU Syntax Error: Plural parts must be of the form "identifier { message }" or "= number { message }"', filename, messageId, messageString, tokens[0].positionInMessage, ); } break; case ST.selectExpr: parseAndConstructNode(ST.selectExpr, 0); break; case ST.selectParts: if (tokens.isNotEmpty && ( tokens[0].type == ST.identifier || tokens[0].type == ST.number || tokens[0].type == ST.other )) { parseAndConstructNode(ST.selectParts, 0); } else { parseAndConstructNode(ST.selectParts, 1); } break; case ST.selectPart: if (tokens.isNotEmpty && tokens[0].type == ST.identifier) { parseAndConstructNode(ST.selectPart, 0); } else if (tokens.isNotEmpty && tokens[0].type == ST.number) { parseAndConstructNode(ST.selectPart, 1); } else if (tokens.isNotEmpty && tokens[0].type == ST.other) { parseAndConstructNode(ST.selectPart, 2); } else { throw L10nParserException( 'ICU Syntax Error: Select parts must be of the form "identifier { message }"', filename, messageId, messageString, tokens[0].positionInMessage ); } break; // At this point, we are only handling terminal symbols. // ignore: no_default_cases default: final Node parent = treeTraversalStack.last; // If we match a terminal symbol, then remove it from tokens and // add it to the tree. if (symbol == ST.empty) { parent.children.add(Node.empty(-1)); } else if (tokens.isEmpty) { throw L10nParserException( 'ICU Syntax Error: Expected "${terminalTypeToString[symbol]}" but found no tokens.', filename, messageId, messageString, messageString.length + 1, ); } else if (symbol == tokens[0].type) { final Node token = tokens.removeAt(0); parent.children.add(token); } else { throw L10nParserException( 'ICU Syntax Error: Expected "${terminalTypeToString[symbol]}" but found "${tokens[0].value}".', filename, messageId, messageString, tokens[0].positionInMessage, ); } if (parent.isFull) { treeTraversalStack.removeLast(); } } } return syntaxTree.children[0]; } final Map<ST, String> terminalTypeToString = <ST, String>{ ST.openBrace: '{', ST.closeBrace: '}', ST.comma: ',', ST.empty: '', ST.identifier: 'identifier', ST.number: 'number', ST.plural: 'plural', ST.select: 'select', ST.equalSign: '=', ST.other: 'other', }; // Compress the syntax tree. Note that after // parse(lex(message)), the individual parts (ST.string, ST.placeholderExpr, // ST.pluralExpr, and ST.selectExpr) are structured as a linked list See diagram // below. This // function compresses these parts into a single children array (and does this // for ST.pluralParts and ST.selectParts as well). Then it checks extra syntax // rules. Essentially, it converts // // Message // / \ // PluralExpr Message // / \ // String Message // / \ // SelectExpr ... // // to // // Message // / | \ // PluralExpr String SelectExpr ... // // Keep in mind that this modifies the tree in place and the values of // expectedSymbolCount and isFull is no longer useful after this operation. Node compress(Node syntaxTree) { Node node = syntaxTree; final List<Node> children = <Node>[]; switch (syntaxTree.type) { case ST.message: case ST.pluralParts: case ST.selectParts: while (node.children.length == 2) { children.add(node.children[0]); compress(node.children[0]); node = node.children[1]; } syntaxTree.children = children; break; // ignore: no_default_cases default: node.children.forEach(compress); } return syntaxTree; } // Takes in a compressed syntax tree and checks extra rules on // plural parts and select parts. void checkExtraRules(Node syntaxTree) { final List<Node> children = syntaxTree.children; switch(syntaxTree.type) { case ST.pluralParts: // Must have an "other" case. if (children.every((Node node) => node.children[0].type != ST.other)) { throw L10nParserException( 'ICU Syntax Error: Plural expressions must have an "other" case.', filename, messageId, messageString, syntaxTree.positionInMessage ); } // Identifier must be one of "zero", "one", "two", "few", "many". for (final Node node in children) { final Node pluralPartFirstToken = node.children[0]; const List<String> validIdentifiers = <String>['zero', 'one', 'two', 'few', 'many']; if (pluralPartFirstToken.type == ST.identifier && !validIdentifiers.contains(pluralPartFirstToken.value)) { throw L10nParserException( 'ICU Syntax Error: Plural expressions case must be one of "zero", "one", "two", "few", "many", or "other".', filename, messageId, messageString, node.positionInMessage, ); } } break; case ST.selectParts: if (children.every((Node node) => node.children[0].type != ST.other)) { throw L10nParserException( 'ICU Syntax Error: Select expressions must have an "other" case.', filename, messageId, messageString, syntaxTree.positionInMessage, ); } break; // ignore: no_default_cases default: break; } children.forEach(checkExtraRules); } Node parse() { try { final Node syntaxTree = compress(parseIntoTree()); checkExtraRules(syntaxTree); return syntaxTree; } on L10nParserException catch (error) { // For debugging purposes. if (logger == null) { rethrow; } logger?.printError(error.toString()); return Node(ST.empty, 0, value: ''); } } }