提交 90ea8636 编写于 作者: xindoo's avatar xindoo

增加match方法，用于获取匹配的字符串集合

上级 4475c7af
......@@ -14,6 +14,7 @@ import java.util.ArrayDeque;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
......@@ -65,7 +66,6 @@ public class Regex {
addedStates.add(((DFAState)dfaGraph.start).getAllStateIds());
while (!queue.isEmpty()) {
State curState = queue.poll();
for (Map.Entry<String, Set<State>> entry : curState.next.entrySet()) {
String key = entry.getKey();
Set<State> nexts = entry.getValue();
......@@ -114,6 +114,7 @@ public class Regex {
edge = getCharSetMatch(reader);
break;
}
// 暂时未支持零宽断言
case '^' : {
break;
}
......@@ -185,6 +186,7 @@ public class Regex {
private static DFAGraph convertNfa2Dfa(NFAGraph nfaGraph) {
DFAGraph dfaGraph = new DFAGraph();
Set<State> startStates = new HashSet<>();
// 用NFA图的起始节点构造DFA的起始节点
startStates.addAll(getNextEStates(nfaGraph.start, new HashSet<>()));
if (startStates.size() == 0) {
startStates.add(nfaGraph.start);
......@@ -192,6 +194,7 @@ public class Regex {
dfaGraph.start = dfaGraph.getOrBuild(startStates);
Queue<DFAState> queue = new LinkedList<>();
Set<State> finishedStates = new HashSet<>();
// 如果BFS的方式从已找到的起始节点遍历并构建DFA
queue.add(dfaGraph.start);
while (!queue.isEmpty()) {
DFAState curState = queue.poll();
......@@ -244,7 +247,7 @@ public class Regex {
reader.next();
break;
} case '{' : {
//
// 暂未支持{}指定重复次数
break;
} default : {
return;
......@@ -252,6 +255,60 @@ public class Regex {
}
}
/**
 * Reads the members of a bracketed character set ({@code [...]}) from the reader.
 *
 * <p>Characters are taken one at a time from {@code reader.next()} and collected verbatim
 * until a {@code ']'} is read. The closing {@code ']'} is consumed but not included in the
 * result. Only letters and digits are supported as set members.
 *
 * <p>NOTE(review): there is no end-of-input check here; what happens for an unterminated
 * set depends on how {@code Reader.next()} behaves at end of input - confirm.
 *
 * @param reader source of pattern characters
 * @return the characters read before the closing {@code ']'}, in the order read
 */
private static String getCharSetMatch(Reader reader) {
    // Accumulate in a StringBuilder: String += inside a loop re-copies the whole prefix each time.
    StringBuilder charSet = new StringBuilder();
    char ch;
    while ((ch = reader.next()) != ']') {
        charSet.append(ch);
    }
    return charSet.toString();
}
/**
 * Parses the body of a {@code {...}} repetition range from the reader.
 *
 * <p>Characters are read via {@code reader.next()} until a {@code '}'} is read; space
 * characters are dropped. The remaining text is interpreted as follows:
 * <ul>
 *   <li>{@code "n"} (no comma): both bounds are {@code n};</li>
 *   <li>{@code "n,"}: lower bound {@code n}, upper bound {@link Integer#MAX_VALUE};</li>
 *   <li>{@code "n,m"}: lower bound {@code n}, upper bound {@code m}.</li>
 * </ul>
 *
 * @param reader source of pattern characters
 * @return a two-element array: index 0 is the lower bound, index 1 the upper bound
 * @throws NumberFormatException if a bound is not a valid {@code int}
 *         (this includes an empty lower bound such as {@code ",m"})
 */
private static int[] getRange(Reader reader) {
    // Accumulate in a StringBuilder rather than String += inside the loop.
    StringBuilder buffer = new StringBuilder();
    char ch;
    while ((ch = reader.next()) != '}') {
        if (ch != ' ') {
            buffer.append(ch);
        }
    }
    String rangeStr = buffer.toString();

    int[] res = new int[2];
    if (!rangeStr.contains(",")) {
        // Single number: exact repetition count.
        res[0] = Integer.parseInt(rangeStr);
        res[1] = res[0];
        return res;
    }

    // Limit -1 keeps a trailing empty string, so "n," yields ["n", ""].
    String[] bounds = rangeStr.split(",", -1);
    res[0] = Integer.parseInt(bounds[0]);
    res[1] = bounds[1].isEmpty() ? Integer.MAX_VALUE : Integer.parseInt(bounds[1]);
    return res;
}
/**
 * Collects the states reachable from {@code curState} by following one or more
 * {@code Constant.EPSILON} edges.
 *
 * <p>The traversal is recursive. {@code stateSet} records every state already visited
 * and is updated in place; states found in it are skipped.
 *
 * @param curState state whose EPSILON successors are explored
 * @param stateSet states already visited by this traversal; mutated by this call
 * @return the newly reached states; an immutable empty set when {@code curState}
 *         has no EPSILON edge
 */
private static Set<State> getNextEStates(State curState, Set<State> stateSet) {
    if (!curState.next.containsKey(Constant.EPSILON)) {
        return Collections.emptySet();
    }
    Set<State> res = new HashSet<>();
    for (State state : curState.next.get(Constant.EPSILON)) {
        // Mark the state as visited BEFORE recursing. Marking it afterwards lets a cycle
        // made purely of EPSILON edges re-enter the same state and recurse without bound.
        if (!stateSet.add(state)) {
            continue;
        }
        res.add(state);
        res.addAll(getNextEStates(state, stateSet));
    }
    return res;
}
/**
 * Convenience overload: forwards to the two-argument {@code isMatch}, passing
 * {@code 0} as the second argument.
 *
 * @param text the input to test
 * @return whatever the two-argument overload returns for {@code (text, 0)}
 */
public boolean isMatch(String text) {
    final int secondArgument = 0;
    return this.isMatch(text, secondArgument);
}
......@@ -264,12 +321,17 @@ public class Regex {
return isMatch(text, 0, start);
}
private boolean isMatch(String text, int pos, State curNFAState) {
/**
* 匹配过程就是根据输入遍历图的过程, 这里DFA和NFA用了同样的代码, 但实际上因为DFA的特性是不会产生回溯的,
* 所以DFA可以换成非递归的形式
*/
private boolean isMatch(String text, int pos, State curState) {
if (pos == text.length()) {
if (curNFAState.isEndState()) {
if (curState.isEndState()) {
return true;
}
for (State nextState : curNFAState.next.getOrDefault(Constant.EPSILON, Collections.emptySet())) {
for (State nextState : curState.next.getOrDefault(Constant.EPSILON, Collections.emptySet())) {
if (isMatch(text, pos, nextState)) {
return true;
}
......@@ -277,7 +339,7 @@ public class Regex {
return false;
}
for (Map.Entry<String, Set<State>> entry : curNFAState.next.entrySet()) {
for (Map.Entry<String, Set<State>> entry : curState.next.entrySet()) {
String edge = entry.getKey();
if (Constant.EPSILON.equals(edge)) {
for (State nextState : entry.getValue()) {
......@@ -301,56 +363,65 @@ public class Regex {
return false;
}
/**
* 暂时只支持字母 数字
* */
private static String getCharSetMatch(Reader reader) {
String charSet = "";
char ch;
while ((ch = reader.next()) != ']') {
charSet += ch;
}
return charSet;
/**
 * Convenience overload: forwards to {@code match(String, int)}, passing {@code 0}
 * as the second argument.
 *
 * @param text the input to scan
 * @return the list returned by {@code match(text, 0)}
 */
public List<String> match(String text) {
    final int secondArgument = 0;
    return this.match(text, secondArgument);
}
private static int[] getRange(Reader reader) {
String rangeStr = "";
char ch;
while ((ch = reader.next()) != '}') {
if (ch == ' ') {
continue;
}
rangeStr += ch;
}
int[] res = new int[2];
if (!rangeStr.contains(",")) {
res[0] = Integer.parseInt(rangeStr);
res[1] = res[0];
} else {
String[] se = rangeStr.split(",", -1);
res[0] = Integer.parseInt(se[0]);
if (se[1].length() == 0) {
res[1] = Integer.MAX_VALUE;
/**
 * Scans {@code text} from left to right and collects the substrings matched when
 * starting from {@code dfaGraph.start}.
 *
 * <p>At each position {@code s}, {@code getMatchEnd} is asked for a match end. A
 * non-empty match {@code [s, e)} is recorded and scanning resumes at {@code e};
 * otherwise scanning resumes at {@code s + 1}.
 *
 * @param text the input to scan
 * @param mod  currently not used by this method
 * @return the matched substrings in order of occurrence; empty if there are none
 */
public List<String> match(String text, int mod) {
    List<String> res = new LinkedList<>();
    int s = 0;
    while (s < text.length()) {
        int e = getMatchEnd(text, s, dfaGraph.start);
        if (e > s) {
            res.add(text.substring(s, e));
            s = e;
        } else {
            // Either no match here (e == -1) or an empty match (e == s). Always advance:
            // assigning s = e for an empty match would never move forward and loop forever.
            s++;
        }
    }
    return res;
}
private static Set<State> getNextEStates(State curState, Set<State> stateSet) {
if (!curState.next.containsKey(Constant.EPSILON)) {
return Collections.emptySet();
// 获取正则表达式在字符串中能匹配到的结尾的位置
private int getMatchEnd(String text, int pos, State curState) {
int end = -1;
if (curState.isEndState()) {
return pos;
}
Set<State> res = new HashSet<>();
for (State state : curState.next.get(Constant.EPSILON)) {
if (stateSet.contains(state)) {
continue;
if (pos == text.length()) {
for (State nextState : curState.next.getOrDefault(Constant.EPSILON, Collections.emptySet())) {
end = getMatchEnd(text, pos, nextState);
if (end != -1) {
return end;
}
}
res.add(state);
res.addAll(getNextEStates(state, stateSet));
stateSet.add(state);
}
return res;
for (Map.Entry<String, Set<State>> entry : curState.next.entrySet()) {
String edge = entry.getKey();
if (Constant.EPSILON.equals(edge)) {
for (State nextState : entry.getValue()) {
end = getMatchEnd(text, pos, nextState);
if (end != -1) {
return end;
}
}
} else {
MatchStrategy matchStrategy = MatchStrategyManager.getStrategy(edge);
if (!matchStrategy.isMatch(text.charAt(pos), edge)) {
continue;
}
// 遍历匹配策略
for (State nextState : entry.getValue()) {
end = getMatchEnd(text, pos + 1, nextState);
if (end != -1) {
return end;
}
}
}
}
return -1;
}
}
package xyz.xindoo.re;
import java.util.List;
public class RegexTest {
public static void main(String[] args) throws Exception {
Regex regex = Regex.compile("a(b|c)*");
Regex regex = Regex.compile("a(b|c)*c");
List<String> res = regex.match("aabacabbbcaccc");
regex.printNfa();
System.out.println("");
regex.printDfa();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册