# builder.py — regex -> NFA construction (Thompson-style)
# -*- coding: utf-8 -*-
from .lexer import Lexer,Token
from .cell import Cell,PairCell,EPSILON,CCL,EMPTY

lexer = None

def create_nfa(pattern_string):
    """Entry point for NFA construction.

    pattern_string :: regular-expression string to compile
    return         :: start node of the resulting NFA
    """
    global lexer
    lexer = Lexer(pattern_string)
    lexer.next()
    machine = PairCell()
    expr(machine)
    return machine.start_node

"""
词法分析按照优先级自顶向下
expr ::= <factor_connect> ("|" factor_connect)*                                  # "|" 加(或)
factor_connect ::= factor | factor factor*                                       # "" 乘(直接连接)
H
HypoX64 已提交
23
factor ::= term | term ("*"|"+")*                                                # "*"闭包  "+"正闭包
H
HypoX64 已提交
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
term ::= char | group | "[" char "-" char "]" | "{" char "," char "}" | "."      # 基本单元,终止符
group ::= "("expr")"                                                             # 递归解决括号优先级
"""

def group(nfa_cells):
    """Parse "(" expr ")" — recursing into expr() gives parentheses priority."""
    if lexer.match(Token.OPEN_PAREN):
        lexer.next()        # consume "("
        expr(nfa_cells)     # build the machine for the inner sub-expression
    if lexer.match(Token.CLOSE_PAREN):
        lexer.next()        # consume ")"
    return True

def term(nfa_cells):
    """Parse one terminal grammar unit and build its machine.

    Handles: single literal char | "." | "[c-c]" range class |
    "{c,c,...}" explicit set | "(" expr ")" group.
    """
    dispatch = (
        (Token.L, nfa_single_char),                   # literal char
        (Token.ANY, nfa_any_single_char),             # .
        (Token.SQUARE_START, nfa_range_single_char),  # [ char - char ]
        (Token.OPEN_CURLY, nfa_set_single_char),      # { char , char }
        (Token.OPEN_PAREN, group),                    # ( expr )
    )
    for token, build in dispatch:
        if lexer.match(token):
            build(nfa_cells)
            break

def factor(nfa_cells):
    """Parse a term followed by an optional closure operator ("*" or "+")."""
    term(nfa_cells)
    closure_follows = lexer.match(Token.CLOSURE) or lexer.match(Token.PLUS_CLOSURE)
    if closure_follows:
        nfa_closure(nfa_cells)

def factor_connect(nfa_cells):
    """Concatenation ("multiplication"): chain successive factors together."""
    if not is_connect_token(lexer.current_token):
        return True

    factor(nfa_cells)
    while is_connect_token(lexer.current_token):
        tail = PairCell()
        factor(tail)
        # Splice: the accept state of the machine so far feeds the new one.
        nfa_cells.end_node.next_1 = tail.start_node
        nfa_cells.end_node = tail.end_node

    return True


def expr(nfa_cells):
    """Parse expr ::= factor_connect ("|" factor_connect)* — the OR operation.

    For every "|" branch, a new fork state points (via its two epsilon
    edges) at both the machine built so far and the new branch, and both
    feed a new join state.  nfa_cells is mutated in place.
    """
    factor_connect(nfa_cells) # first branch

    new_cells = PairCell() # other branches; factor_connect repopulates it each pass
    while lexer.match(Token.OR):
        lexer.next()
        factor_connect(new_cells)

        # new fork state reaching both alternatives
        start = Cell()
        start.next_1 = new_cells.start_node
        start.next_2 = nfa_cells.start_node
        nfa_cells.start_node = start

        # new join state reached from both alternatives
        end = Cell()
        new_cells.end_node.next_1 = end
        nfa_cells.end_node.next_2 = end
        nfa_cells.end_node = end

    return True

def nfa_single_char(nfa_cells):
    """Build a two-state machine matching exactly one literal character (Token.L).

    Returns True on success, False when the current token is not Token.L.
    """
    if not lexer.match(Token.L):
        return False

    head = Cell()
    tail = Cell()
    head.next_1 = tail
    head.edge = CCL
    head.char_set = {lexer.current_text}

    nfa_cells.start_node = head
    nfa_cells.end_node = tail

    lexer.next()
    return True

def nfa_any_single_char(nfa_cells):
    """Build a two-state machine for "." (any single character).

    The machine's single CCL edge carries chr(0)..chr(126).
    Returns True on success, False when the current token is not Token.ANY.

    Fix: previously returned False even after successfully building the
    machine, inconsistent with every other nfa_* constructor.
    """
    if not lexer.match(Token.ANY):
        return False

    start = nfa_cells.start_node = Cell()
    nfa_cells.end_node = nfa_cells.start_node.next_1 = Cell()

    start.edge = CCL
    # NOTE(review): range(127) yields chr(0)..chr(126); chr(127) (DEL) is
    # excluded — confirm that is intentional for "." semantics.
    start.char_set = {chr(i) for i in range(127)}

    lexer.next()
    return True

def nfa_range_single_char(nfa_cells):
    """Build a two-state machine for a character class "[a-z]".

    Literal characters and "-" ranges are accumulated into start.char_set
    until the closing "]".  Returns False when the current token is not "[".

    Fix: initialise start.char_set explicitly (as nfa_single_char does)
    instead of relying on Cell supplying a fresh per-instance set — a
    class-level set would silently alias across cells.
    """
    if not lexer.match(Token.SQUARE_START):
        return False
    lexer.next()
    start = nfa_cells.start_node = Cell()
    start.next_1 = nfa_cells.end_node = Cell()

    start.edge = CCL
    start.char_set = set()

    # Accumulate literal chars and expanded "-" ranges.
    first = ''
    while not lexer.match(Token.SQUARE_END):
        if not lexer.match(Token.DASH):
            first = lexer.current_text
            start.char_set.add(first)
        else:
            # "a-z": expand from the previous literal through the char
            # following the dash, inclusive.
            lexer.next()
            for code in range(ord(first), ord(lexer.current_text) + 1):
                start.char_set.add(chr(code))
        lexer.next()

    lexer.next()  # consume "]"
    return True

def nfa_set_single_char(nfa_cells):
    """Build a two-state machine for "{a,b,c,...}" — one char from an explicit set.

    Equivalent to (a|b|c...).  Comma tokens are separators and are skipped.
    Returns False when the current token is not "{".

    Fix: initialise start.char_set explicitly (as nfa_single_char does)
    instead of relying on Cell supplying a fresh per-instance set — a
    class-level set would silently alias across cells.
    """
    if not lexer.match(Token.OPEN_CURLY):
        return False

    lexer.next()
    start = nfa_cells.start_node = Cell()
    start.next_1 = nfa_cells.end_node = Cell()
    start.edge = CCL
    start.char_set = set()

    while not lexer.match(Token.CLOSE_CURLY):
        if lexer.current_text != ',':
            start.char_set.add(lexer.current_text)
        lexer.next()
    lexer.next()  # consume "}"
    return True
 
def nfa_closure(nfa_cells):
    """Wrap the current machine in a closure: "*" (zero or more) or "+" (one or more)."""
    is_star = lexer.match(Token.CLOSURE)
    if not is_star and not lexer.match(Token.PLUS_CLOSURE):
        return False

    new_start = Cell()
    new_end = Cell()
    new_start.next_1 = nfa_cells.start_node
    if is_star:
        # "*" only: epsilon shortcut start -> end lets the body match zero times
        new_start.next_2 = new_end

    # loop-back edge enables repetition; second edge exits to the new accept state
    nfa_cells.end_node.next_1 = nfa_cells.start_node
    nfa_cells.end_node.next_2 = new_end

    nfa_cells.start_node = new_start
    nfa_cells.end_node = new_end

    lexer.next()
    return True

def is_connect_token(token):
    """Return True when token can begin a concatenated factor (i.e. it is
    neither an operator nor a terminator token)."""
    terminators = (
        Token.CLOSE_PAREN,
        Token.EOS,
        Token.CLOSURE,
        Token.PLUS_CLOSURE,
        Token.CLOSE_CURLY,
        Token.SQUARE_END,
        Token.OR,
    )
    return token not in terminators