Улучшение производительности парсера Java

-1

У меня есть парсер, описанный в файле .jjt. Он очень похож на this. В моём случае единственное изменение — это добавление моих собственных методов вычисления выражений. Сейчас разбор одного выражения занимает около 1 миллисекунды. Мне нужно улучшить производительность этого парсера. Я использовал VisualVM для профилирования и выяснил, что из 44,5 секунды, которые потребовались для выполнения моего кода (он читает строки файла в ArrayList<String> и вычисляет с помощью моего парсера 93 выражения, значения параметров для которых берутся из строк файла в ArrayList<String>), около 43 секунд было потрачено в методе parseStream. Я воспользовался советами из this link для улучшения парсера, а также попробовал установить опцию ERROR_REPORTING в FALSE, но это не помогло.

EDIT 1:

Вот файл parser.jjt. Я профилировал своё приложение на файле из 1000 строк, и больше всего времени занял метод Start().

// JavaCC / JJTree generator options for this grammar.
options { 
    JAVA_UNICODE_ESCAPE = true; // process \uXXXX escapes in the input before tokenizing
    MULTI = true; // one AST node class per node name (e.g. ASTStart for #Start)
    VISITOR = true; // generate a visitor interface and jjtAccept methods for the nodes
    VISITOR_EXCEPTION = "ParseException"; // exception type declared on the visitor methods
    NODE_DEFAULT_VOID = true; // productions build a node only where annotated with #Name
// NODE_PACKAGE = "org.nfunk.jep.parser"; 
// BUILD_NODE_FILES=false; 
    STATIC = false; // parser members are instance members, not static
// Debug options below are disabled; they make the generated code print traces.
// DEBUG_TOKEN_MANAGER = true; 
// DEBUG_PARSER = true; 
// DEBUG_LOOKAHEAD = true; 
} 


/*************************************************************** 
PARSER BEGIN 
***************************************************************/ 

PARSER_BEGIN(Parser) 
package org.nfunk.jep; 

import java.util.Vector; 
import org.nfunk.jep.function.*; 
import org.nfunk.jep.type.*; 

public class Parser {
    /** Letters that may follow a backslash inside a string literal. */
    private static final String ESCAPE_LETTERS = "tnrbf\\\"'";

    /** Characters the escape letters stand for, index-aligned with ESCAPE_LETTERS. */
    private static final String ESCAPE_VALUES = "\t\n\r\b\f\\\"'";

    /** JEP instance the current parse belongs to; source of the tables and the error list. */
    private JEP  jep;

    /** Symbol table of {@link #jep}, refreshed on every {@link #restart}. */
    private SymbolTable symTab;

    /** Operator set of {@link #jep}, refreshed on every {@link #restart}. */
    private OperatorSet opSet;

    /** Lexical state the token manager is switched to at the start of each parse. */
    private int initialTokenManagerState = DEFAULT;

    /**
     * Parses one expression from the given stream and returns its root node.
     *
     * @param stream source of the expression text
     * @param jep_in JEP instance supplying the symbol table, operator set and error list
     * @return the first child of the start node, i.e. the root of the parsed expression
     * @throws ParseException on a syntax error or when the input contains no expression
     */
    public Node parseStream(java.io.Reader stream, JEP jep_in)
            throws ParseException {
        restart(stream, jep_in);
        // NOTE(review): per the JavaCC documentation this only produces trace output
        // when the grammar is generated with DEBUG_PARSER, which is commented out in
        // the options block of this file - confirm before relying on it.
        enable_tracing();
        // Parse the expression and return the node below the start node.
        Node node = Start();
        if (node == null) {
            throw new ParseException("No expression entered");
        }
        return node.jjtGetChild(0);
    }

    /**
     * Restart the parse with the given stream.
     * Re-initialises the parser on the stream, switches the token manager to the
     * configured initial state and picks up the tables of the given JEP instance.
     *
     * @param stream source of the expression text
     * @param jep_in JEP instance to parse for
     * @since 2.3.0 beta 1
     */
    public void restart(java.io.Reader stream, JEP jep_in)
    {
        ReInit(stream);
        this.token_source.SwitchTo(initialTokenManagerState);
        jep = jep_in;
        symTab = jep.getSymbolTable();
        opSet = jep.getOperatorSet();
    }

    /**
     * Continue parsing without re-initialising the stream.
     * Allows re-entrancy of the parser so that strings like
     * "x=1; y=2; z=3;" can be parsed.
     * When a semicolon is encountered parsing finishes, leaving the rest of the string unparsed.
     * Parsing can be resumed from the current position by using this method.
     * For example
     * <pre>
     * XJep j = new XJep();
     * Parser parse = j.getParse();
     * StringReader sr = new StringReader("x=1; y=2; z=3;");
     * parse.restart(sr,j);
     * Node node;
     * try {
     * while((node = j.continueParse())!=null) {
     * j.println(node);
     * } }catch(ParseException e) {}
     * </pre>
     *
     * @return the root of the next expression, or null when no expression was found
     * @throws ParseException on a syntax error
     */
    public Node continueParse() throws ParseException
    {
        ASTStart node = Start();
        if (node == null) {
            return null;
        }
        return node.jjtGetChild(0);
    }

    /** Appends a message to the error list of the current JEP instance. */
    private void addToErrorList(String errorStr) {
        jep.errorList.addElement(errorStr);
    }

    /**
     * Sets the initial state that the token manager is in.
     * Can be used to change how x.x is interpreted, either as a single
     * identifier (DEFAULT) or as x &lt;DOT&gt; x (NO_DOT_IN_IDENTIFIERS).
     *
     * @param state the state to be in. Currently the only legal values are
     *              DEFAULT and NO_DOT_IN_IDENTIFIERS
     */
    public void setInitialTokenManagerState(int state)
    {
        initialTokenManagerState = state;
    }

    /**
     * Translate all escape sequences to characters. Inspired by Rob Millar's
     * unescape() method in rcm.util.Str from the Web Sphinx project.
     * <p>
     * Unknown escape sequences and a lone trailing backslash are copied to the
     * result unchanged.
     *
     * @param inputStr String containing escape characters.
     * @return String with all escape sequences replaced.
     */
    private String replaceEscape(String inputStr) {
        int i = inputStr.indexOf('\\');
        if (i == -1) {
            // Common case: nothing to translate, so no copy is needed.
            return inputStr;
        }

        int len = inputStr.length();
        int p = 0; // start of the text not yet copied to the output
        StringBuilder output = new StringBuilder(len);

        while (i != -1) {
            output.append(inputStr, p, i);

            if (i + 1 == len) {
                // Lone backslash at the very end: everything before it has just been
                // copied, so only the backslash itself is left for the tail copy below.
                // (Previously p was left untouched and the preceding text was emitted twice.)
                p = i;
                break;
            }

            char metac = inputStr.charAt(i + 1);
            int k = ESCAPE_LETTERS.indexOf(metac);
            if (k == -1) {
                // Unknown escape: leave the sequence as found. Should be unreachable
                // through the parser, because STRING_LITERAL only accepts the
                // escape letters listed above.
                output.append('\\').append(metac);
            } else {
                output.append(ESCAPE_VALUES.charAt(k));
            }

            // Skip over both the backslash and the escape letter.
            p = i + 2;
            i = inputStr.indexOf('\\', p);
        }

        // Add the rest of the input string to the output.
        if (p < len) {
            output.append(inputStr, p, len);
        }

        return output.toString();
    }
}

PARSER_END(Parser) 

/*************************************************************** 
SKIP 
***************************************************************/ 

// Input that is discarded in every lexical state: whitespace and Java-style comments.
<*> SKIP : 
{ 
    " " 
    | "\t" 
    | "\n" 
    | "\r" 

    // Single-line comment. The pattern requires a terminating line break, so a
    // "//" comment that ends the input without a newline is not matched by this rule.
    | <"//" (~["\n","\r"])* ("\n"|"\r"|"\r\n")> 
    // Block comment: "/*", anything up to the first "*/".
    | <"/*" (~["*"])* "*" (~["/"] (~["*"])* "*")* "/"> 
} 


/*************************************************************** 
TOKENS 
***************************************************************/ 

// Numeric and string literals, recognised in every lexical state.
// Names prefixed with '#' are private helper expressions, not tokens of their own.
<*> TOKEN : /* LITERALS */ 
{ 
    // Unsigned decimal integer (one or more digits).
    < INTEGER_LITERAL: 
     <DECIMAL_LITERAL> 
    > 
| 
    < #DECIMAL_LITERAL: ["0"-"9"] (["0"-"9"])* > 
| 
    // Unsigned floating point number: "1.", "1.5", ".5", "1e3", each with optional exponent.
    < FLOATING_POINT_LITERAL: 
     (["0"-"9"])+ "." (["0"-"9"])* (<EXPONENT>)? 
     | "." (["0"-"9"])+ (<EXPONENT>)? 
     | (["0"-"9"])+ <EXPONENT> 
    > 
| 
    < #EXPONENT: ["e","E"] (["+","-"])? (["0"-"9"])+ > 
| 
    // Double-quoted string on a single line; a backslash may only be followed by
    // one of n t b r f \ ' " (the same set replaceEscape() translates).
    < STRING_LITERAL: 
     "\"" 
     ((~["\"","\\","\n","\r"]) 
     | ("\\" ["n","t","b","r","f","\\","'","\""]) 
     )* 
     "\"" 
    > 
} 

/* IDENTIFIERS 

    Letters before version 2.22 
    < #LETTER: ["_","a"-"z","A"-"Z"] > 

    In Ver 2.3.0.1 presence of . in an identifier is switchable. 
    In the DEFAULT lexical state identifiers can contain a . 
    In the NO_DOT_IN_IDENTIFIERS state identifiers cannot contain a . 
    the state can be set by using 
    Parser.setInitialTokenManagerState 

    Note: the token names are spelled INDENTIFIER1 / INDENTIFIER2 (sic); the
    grammar below refers to them by exactly these names.
*/ 

// Identifier in the DEFAULT state: a letter followed by letters, digits or dots.
<DEFAULT> TOKEN: 
{ 
    <INDENTIFIER1: <LETTER1>(<LETTER1>|<DIGIT1>|".")*> 
    | 
    < #LETTER1: 
    [ 
     "\u0024",   // $ 
     "\u0041"-"\u005a", // A - Z 
     "\u005f",   // _ 
     "\u0061"-"\u007a", // a - z 
     "\u00c0"-"\u00d6", // Upper case symbols of Latin-1 Supplement 
     "\u00d8"-"\u00f6", // Lower case symbols of Latin-1 Supplement 
     "\u00f8"-"\u00ff", // More lower case symbols of Latin-1 Supplement 
     "\u0100"-"\u1fff", // Many languages (including Greek) 
     "\u3040"-"\u318f", // Hiragana, Katakana, Bopomofo, Hangul Compatibility Jamo 
     "\u3300"-"\u337f", // CJK Compatibility 
     "\u3400"-"\u3d2d", // CJK Unified Ideographs Extension A 
     "\u4e00"-"\u9fff", // CJK Unified Ideographs 
     "\uf900"-"\ufaff" // CJK Compatibility Ideographs 
    ] 
    > 
| 
    < #DIGIT1: ["0"-"9"] > 
} 

// Identifier in the NO_DOT_IN_IDENTIFIERS state: same letter set as above,
// but a dot is not part of the identifier.
<NO_DOT_IN_IDENTIFIERS> TOKEN: 
{ 
    <INDENTIFIER2: <LETTER2>(<LETTER2>|<DIGIT2>)*> 
    | 
    < #LETTER2: 
    [ 
     "\u0024",   // $ 
     "\u0041"-"\u005a", // A - Z 
     "\u005f",   // _ 
     "\u0061"-"\u007a", // a - z 
     "\u00c0"-"\u00d6", // Upper case symbols of Latin-1 Supplement 
     "\u00d8"-"\u00f6", // Lower case symbols of Latin-1 Supplement 
     "\u00f8"-"\u00ff", // More lower case symbols of Latin-1 Supplement 
     "\u0100"-"\u1fff", // Many languages (including Greek) 
     "\u3040"-"\u318f", // Hiragana, Katakana, Bopomofo, Hangul Compatibility Jamo 
     "\u3300"-"\u337f", // CJK Compatibility 
     "\u3400"-"\u3d2d", // CJK Unified Ideographs Extension A 
     "\u4e00"-"\u9fff", // CJK Unified Ideographs 
     "\uf900"-"\ufaff" // CJK Compatibility Ideographs 
    ] 
    > 
    | 
    < #DIGIT2: ["0"-"9"] > 
} 

/* OPERATORS */ 
// Operator and punctuation tokens, recognised in every lexical state.
// Several tokens share a prefix ("=" / "==", "<" / "<=", "^" / "^^", "!" / "!=");
// NOTE(review): this relies on the token manager preferring the longest match - see
// the JavaCC lexer documentation.
<*> TOKEN: 
{ 
    < ASSIGN:"=" > // rjm 
| < SEMI: ";" > // rjm 
| < COMMA: "," > // rjm 
| < GT: ">" > 
| < LT: "<" > 
| < EQ: "==" > 
| < LE: "<=" > 
| < GE: ">=" > 
| < NE: "!=" > 
| < AND: "&&" > 
| < OR: "||" > 
| < PLUS: "+" > 
| < MINUS:"-" > 
| < MUL: "*" > 
| < DOT: "." > // rjm 
| < DIV: "/" > 
| < MOD: "%" > 
| < NOT: "!" > 
| < POWER:"^" > 
| < CROSS:"^^" > // rjm 
| < LSQ: "[" > // rjm 
| < RSQ: "]" > // rjm 
| < LRND: "(" > // rjm 
| < RRND: ")" > // rjm 
| < COLON: ":" > // rjm 
} 


/*************************************************************** 
GRAMMAR START 
***************************************************************/ 

// Entry production: one expression terminated by end of input or a semicolon.
// Returns the Start node holding the expression, or null when the input
// consists of the terminator only (callers turn null into their own error).
ASTStart Start() #Start : 
{ 
} 
{ 
    Expression() (<EOF> | <SEMI>) { return jjtThis; } 
    | (<EOF> | <SEMI>) 
    { 
     // njf - The next line is commented out in 2.3.0 since 
     //  two "No expression entered" errors are reported 
     //  in EvaluatorVisitor and Console (one from here 
     //  the other from ParseStream()) 
     //  Decided to just return null, and handle the error 
     //  in ParseStream. 
     // addToErrorList("No expression entered"); 
     return null; 
    } 
} 

// Expressions can be like 
// x=3 
// x=y=3 parsed as x=(y=3) 
//
// The syntactic lookahead scans a complete LValue followed by '=' before the
// assignment alternative is chosen; everything else is a RightExpression.
// NOTE(review): this lookahead runs for every Expression, including nested ones,
// so it is a candidate to examine when profiling parse time.

void Expression() : {} 
{ 
    LOOKAHEAD(LValue() <ASSIGN>)  // needed to prevent a javacc warning about left recursion 
    AssignExpression() // rjm changes from OrExpression 
    | 
    RightExpression() 
} 

// Assignment: LValue '=' Expression, built as a two-child FunNode carrying the
// assign operator. Right-associative because the right side is a full Expression.
// Throws ParseException when assignment is not enabled on the JEP instance.
void AssignExpression() : {} // rjm addition 
{ 

    (LValue() <ASSIGN> Expression() 
     { 
      if (!jep.getAllowAssignment()) throw new ParseException(
      "Syntax Error (assignment not enabled)"); 

      jjtThis.setOperator(opSet.getAssign()); 
     } 
    ) 
     #FunNode(2) 
} 

// Any expression that is not an assignment; delegates to the lowest-precedence
// binary operator level.
void RightExpression() : 
{ 
} 
{ 
    OrExpression() 
} 

// Logical OR level: AndExpression ( "||" AndExpression )*.
// Each "||" wraps the operands parsed so far into a two-child FunNode.
void OrExpression() : 
{ 
} 
{ 
    AndExpression() 
    (
     (<OR> AndExpression() 
     { 
      jjtThis.setOperator(opSet.getOr()); 
     } 
    ) #FunNode(2) 
    )* 
} 


// Logical AND level: EqualExpression ( "&&" EqualExpression )*.
// Each "&&" wraps the operands parsed so far into a two-child FunNode.
void AndExpression() : 
{ 
} 
{ 
    EqualExpression() 
    (
     (<AND> EqualExpression() 
     { 
      jjtThis.setOperator(opSet.getAnd()); 
     } 
    ) #FunNode(2) 
    )* 
} 



// Equality level: RelationalExpression ( ("!=" | "==") RelationalExpression )*.
// Each operator produces a two-child FunNode with the NE or EQ operator.
void EqualExpression() : 
{ 
} 
{ 
    RelationalExpression() 
    (
     (<NE> RelationalExpression() 
     { 
     jjtThis.setOperator(opSet.getNE()); 
     } 
    ) #FunNode(2) 
    | 
     (<EQ> RelationalExpression() 
     { 
      jjtThis.setOperator(opSet.getEQ()); 
     } 
    ) #FunNode(2) 
    )* 
} 



// Relational level: AdditiveExpression ( ("<" | ">" | "<=" | ">=") AdditiveExpression )*.
// Each operator produces a two-child FunNode with the matching comparison operator.
void RelationalExpression() : 
{ 
} 
{ 
    AdditiveExpression() 
    (
    (<LT> AdditiveExpression() 
     { 
     jjtThis.setOperator(opSet.getLT()); 
     } 
    ) #FunNode(2) 
    | 
    (<GT> AdditiveExpression() 
     { 
     jjtThis.setOperator(opSet.getGT()); 
     } 
    ) #FunNode(2) 
    | 
    (<LE> AdditiveExpression() 
     { 
     jjtThis.setOperator(opSet.getLE()); 
     } 
    ) #FunNode(2) 
    | 
    (<GE> AdditiveExpression() 
     { 
     jjtThis.setOperator(opSet.getGE()); 
     } 
    ) #FunNode(2) 
)* 
} 


// Additive level: MultiplicativeExpression ( ("+" | "-") MultiplicativeExpression )*.
// Each operator produces a two-child FunNode with the add or subtract operator.
void AdditiveExpression() : 
{ 
} 
{ 
    MultiplicativeExpression() 
    (
    (<PLUS> MultiplicativeExpression() 
     { 
     jjtThis.setOperator(opSet.getAdd()); 
     } 
    ) #FunNode(2) 
    | 
    (<MINUS> MultiplicativeExpression() 
     { 
     jjtThis.setOperator(opSet.getSubtract()); 
     } 
    ) #FunNode(2) 
)* 
} 


// Multiplicative level: UnaryExpression followed by any number of
//   - a juxtaposed PowerExpression (implicit multiplication, e.g. "2 x"),
//   - "*", ".", "^^", "/" or "%" with a UnaryExpression operand.
// Each step produces a two-child FunNode. Implicit multiplication is parsed
// first and then rejected with a ParseException if jep.implicitMul is false.
void MultiplicativeExpression() : 
{ 
} 
{ 
    UnaryExpression() 
    (
    (  
     PowerExpression() 
     { 
     if (!jep.implicitMul) throw new ParseException(
      "Syntax Error (implicit multiplication not enabled)"); 

     jjtThis.setOperator(opSet.getMultiply()); 
     } 
    ) #FunNode(2) 
    | 
    (<MUL> UnaryExpression() 
     { 
     jjtThis.setOperator(opSet.getMultiply()); 
     } 
    ) #FunNode(2) 
    | 
    (<DOT> UnaryExpression() 
     { 
     jjtThis.setOperator(opSet.getDot()); 
     } 
    ) #FunNode(2) 
    | 
    (<CROSS> UnaryExpression() 
     { 
     jjtThis.setOperator(opSet.getCross()); 
     } 
    ) #FunNode(2) 
    | 
    (<DIV> UnaryExpression() 
     { 
     jjtThis.setOperator(opSet.getDivide()); 
     } 
    ) #FunNode(2) 
    | 
    (<MOD> UnaryExpression() 
     { 
     jjtThis.setOperator(opSet.getMod()); 
     } 
    ) #FunNode(2) 
)* 
} 


// Prefix operators. Unary plus is accepted but builds no node of its own;
// unary minus and logical not each build a one-child FunNode. Anything else
// is a PowerExpression.
void UnaryExpression() : 
{ 
} 
{ 
    (<PLUS> UnaryExpression()) 
| 
    (<MINUS> UnaryExpression() 
    { 
     jjtThis.setOperator(opSet.getUMinus()); 
    } 
) #FunNode(1) 
| 
    (<NOT> UnaryExpression() 
    { 
     jjtThis.setOperator(opSet.getNot()); 
    } 
) #FunNode(1) 
| 
    PowerExpression() 
} 


// Power level: a primary optionally followed by "^" and a UnaryExpression.
// The exponent is a UnaryExpression, so it may carry a sign ("2^-3") and may
// itself be a PowerExpression, which makes "^" group to the right.
void PowerExpression() : 
{ 
} 
{ 
    UnaryExpressionNotPlusMinus() 
    [ 
    (<POWER> UnaryExpression() 
    { 
     jjtThis.setOperator(opSet.getPower()); 
    } 
) #FunNode(2) 
    ] 
} 


// Primary expressions, tried in this order:
//   1. a string or numeric constant,
//   2. an array access (chosen only if a complete ArrayAccess can be scanned ahead),
//   3. a function call (chosen only if the next identifier is a key of jep.funTab),
//   4. a plain variable,
//   5. a parenthesised expression,
//   6. a list expression "[a, b, ...]".
// The previously declared locals identString and type were never used and have
// been removed; alternatives and lookaheads are unchanged.
void UnaryExpressionNotPlusMinus() : 
{ 
} 
{ 
    AnyConstant() 
    | 
    LOOKAHEAD(ArrayAccess()) 
    ArrayAccess() 
    | 
    LOOKAHEAD({ (getToken(1).kind == INDENTIFIER1 || getToken(1).kind == INDENTIFIER2) && 
        jep.funTab.containsKey(getToken(1).image) }) 
    Function() 
    | 
    Variable() 
    | 
    <LRND> Expression() <RRND> 
    | 
// LOOKAHEAD(<LSQ> Expression() <COLON>) 
// RangeExpression() 
// | 
    ListExpression() 
} 

// Bracketed, comma-separated list of one or more expressions, built as a FunNode
// carrying the list operator. The operator is set in the declarations block,
// i.e. before any of the elements is parsed.
void ListExpression() #FunNode: 
{ 
    jjtThis.setOperator(opSet.getList()); 
} 
{ 
    <LSQ> Expression() (<COMMA> Expression())* <RSQ> 
} 

/* 
void RangeExpression() #FunNode: 
{ 
    jjtThis.setOperator(opSet.getRange()); 
} 
{ 
    <LSQ> Expression() (<COLON> Expression())+ <RSQ> 
} 
*/ 

// Assignment target: an array access if one can be scanned ahead completely,
// otherwise a plain variable.
void LValue() : 
{ 
} 
{ 
    LOOKAHEAD(ArrayAccess()) 
    ArrayAccess() 
    | Variable() 
} 

// Element access "v[i, ...]": a variable followed by a list expression, combined
// into a two-child FunNode carrying the element operator.
void ArrayAccess() : 
{ 
} 
{ 
    Variable() ListExpression() 
    { 
    jjtThis.setOperator(opSet.getElement()); 
    } #FunNode(2) 

} 
// Variable reference, built as a VarNode.
// A name already in the symbol table is bound to its existing variable. An
// unknown name is created on the fly when jep.allowUndeclared is set; otherwise
// an "Unrecognized symbol" message is added to the error list and the node is
// left without a variable.
void Variable() : 
{ 
    String identString = ""; 
} 
{ 
    (identString = Identifier() 
    { 
     if (symTab.containsKey(identString)) { 
      jjtThis.setVar(symTab.getVar(identString)); 
     } else { 
      if (jep.allowUndeclared) { 
       jjtThis.setVar(symTab.makeVarIfNeeded(identString)); 
      } else { 
       addToErrorList("Unrecognized symbol \"" + identString +"\""); 
      } 
     } 
    } 
    ) #VarNode 
} 



// Function call "name(args)", built as a FunNode.
// The name is looked up in jep.funTab; when found, the node is bound to the
// function and its declared parameter count is handed to ArgumentList for
// checking. When not found, an error is recorded and ArgumentList is still
// parsed with a required count of 0.
void Function() : 
{ 
    int reqArguments = 0; 
    String identString = ""; 
} 
{ 
    (identString = Identifier() 
     { 
      if (jep.funTab.containsKey(identString)) { 
       // Fetch and cast the command once instead of once per use.
       PostfixMathCommandI command = 
        (PostfixMathCommandI)jep.funTab.get(identString); 
       //Set number of required arguments 
       reqArguments = command.getNumberOfParameters(); 
       jjtThis.setFunction(identString, command); 
      } else { 
       addToErrorList("!!! Unrecognized function \"" + identString +"\""); 
      } 
     } 

     <LRND> ArgumentList(reqArguments, identString) <RRND> 

    ) #FunNode 
} 

// Optional comma-separated argument list of a function call.
// Counts the parsed arguments and records an error (without aborting the parse)
// when the count does not fit the function:
//   reqArguments == -1  -> the function itself validates the count via
//                          checkNumberOfParameters(count);
//   otherwise           -> the count must equal reqArguments exactly.
// NOTE(review): Function() passes reqArguments == 0 when the name is not in
// jep.funTab, so an unknown function called with arguments also gets a
// "requires 0 parameters" message in addition to the unrecognized-function one.
void ArgumentList(int reqArguments, String functionName) : 
{ 
    int count = 0; 
    String errorStr = ""; 
} 
{ 
    [ 
    Expression() { count++; } 
    (
     <COMMA> 
     Expression() { count++; } 
    )* 
    ] 
    { 
     if(reqArguments == -1) { 
      if(!((PostfixMathCommandI)jep.funTab.get(functionName)).checkNumberOfParameters(count)) 
      { 
       errorStr = "Function \"" + functionName +"\" illegal number of arguments " + count; 
       addToErrorList(errorStr); 
      } 
     } 
     else if (reqArguments != count) { 
      errorStr = "Function \"" + functionName +"\" requires " 
         + reqArguments + " parameter"; 
      if (reqArguments!=1) errorStr += "s"; // pluralise "parameter"
      addToErrorList(errorStr); 
     } 
    } 
} 



// Matches an identifier token of either lexical state and returns its text.
String Identifier() : 
{ 
    Token t; 
} 
{ 
    (t = <INDENTIFIER1> | t = <INDENTIFIER2>) { return t.image; } 
} 


// Literal constant, built as a Constant node.
// A string literal is stored without its surrounding quotes and with escape
// sequences translated; a numeric literal is stored as the object returned by
// RealConstant().
void AnyConstant() #Constant: 
{ 
    Token t; 
    Object value; 
} 
{ 
    t=<STRING_LITERAL> { 
     // strip away double quotes at end of string 
     String temp = (t.image).substring(1,t.image.length()-1); 

     // replace escape characters 
     temp = replaceEscape(temp); 

     jjtThis.setValue(temp); 
    } 
    | 
    value = RealConstant() { 
     jjtThis.setValue(value); 
// } 
// | 
// value = Array() { 
//  jjtThis.setValue(value); 
    } 
} 

/* 
Vector Array() : 
{ 
    Object value; 
    Vector result = new Vector(); 
} 
{ 
    <LSQ> 
    value = RealConstant() 
    { 
     result.addElement(value); 
    } 
    (
     <COMMA> 
     value = RealConstant() 
     { 
      result.addElement(value); 
     } 
    )* 
    <RSQ> 
    { 
     return result; 
    } 
} 
*/ 




// Numeric literal (integer or floating point). The token text is converted by
// the JEP number factory; if the factory throws, an error is recorded and null
// is returned instead of propagating the exception.
Object RealConstant() : 
{ 
    Token t; 
    Object value; 
} 
{ 
    (t=<INTEGER_LITERAL> | t=<FLOATING_POINT_LITERAL>) 
    { 
     try { 
      value = jep.getNumberFactory().createNumber(t.image); 
     } catch (Exception e) { 
      // Deliberate best-effort: report through the error list, not by throwing.
      value = null; 
      addToErrorList("Can't parse \"" + t.image + "\""); 
     } 

     return value; 
    } 
}

источник

2016-07-18 shalakha

Опубликуйте код, который вы написали. – Jokab

Если только вы не сделали что-то в корне неправильное в своей грамматике, которую мы не видим, крайне маловероятно, что производительность парсера является здесь настоящей проблемой. Сколько миллионов выражений в секунду вам действительно нужно разбирать? – EJP

Сохранение всего файла в памяти, конечно же, не поможет производительности. Почему бы не разобрать строки по мере их чтения? – VGR

При профилировании следует искать узкие места: ReadStream занимает больше всего времени, но в каких именно функциях? Некоторые профилировщики наглядно показывают такие горячие точки.

Моё внимание привлёк метод replaceEscape:

private static final String METACHARS = "tnrbf\\\"'"; 
private static final String CHARS = "\t\n\r\b\f\\\"'"; 

private String replaceEscape(String inputStr) { 
    int i = inputStr.indexOf('\\'); 
    if (i == -1) { // 1. Heuristic strings without backslash 
     return inputStr; 
    } 
    int len = inputStr.length(); 
    int p = 0; 
    StringBuilder output = new StringBuilder(); // 2. Faster StringBuilder 

    while (i != -1) { 
     if (i + 1 == len) break; 

     if (p < i) output.append(inputStr.substring(p, i)); 
     p = i + 1; 
     char metac = inputStr.charAt(i+1); 

     // find the index of the metac 
     int k = METACHARS.indexOf(metac); 
     if (k != -1) { 
      // its corresponding true char 
      metac = CHARS.charAt(k)); 
      ++p; // Start copying after metachar 
     } 
     output.append(metac); 
    } 

    // add the end of the input string to the output 
    if (p < len) 
     output.append(inputStr.substring(p)); 
    return output.toString(); 
}

источник

2016-07-19 07:20:58

любое предложение относительно того, как я могу улучшить его? – shalakha

Включите трассировку разбора. Я этого не проверял, но, возможно, грамматика в каком-то месте неэффективна. Например, некоторые поддеревья AST можно свести к одному узлу. Структуры данных, такие как таблица символов, могут быть неоптимальными. Проверьте разные варианты использования грамматики (быстрее ли без переменных?). Используйте 'Files.readAllLines'. Используйте BufferedReader. –

Улучшение производительности парсера Java

ответ

Смежные вопросы