// scanner.rs
use onig::{Regex};
use unicode_segmentation::UnicodeSegmentation;
#[derive(Debug, Clone, Serialize)]
P
Phodal Huang 已提交
6
pub struct IOnigCaptureIndex {
P
Phodal Huang 已提交
7 8 9
    pub start: usize,
    pub end: usize,
    pub length: usize,
P
Phodal Huang 已提交
10 11
}

#[derive(Debug, Clone, Serialize)]
P
Phodal Huang 已提交
13
pub struct IOnigMatch {
P
Phodal Huang 已提交
14
    pub index: usize,
P
Phodal Huang 已提交
15 16 17
    pub capture_indices: Vec<IOnigCaptureIndex>,
}

/// Tries a list of regex patterns, one after another, against an input string.
#[derive(Debug, Clone)]
pub struct Scanner {
    /// Cursor into `patterns`: the pattern the next search attempt starts from.
    pub index: usize,
    /// Pattern sources, tried in list order.
    pub patterns: Vec<String>,
}

impl Scanner {
    pub fn new(patterns: Vec<String>) -> Self {
        Scanner {
            index: 0,
            patterns,
        }
    }

P
Phodal Huang 已提交
32 33 34 35
    pub fn dispose(&mut self) {
        self.index = 0
    }

36
    pub fn find_next_match_sync(&mut self, origin_str: String, start_position: i32) -> Option<IOnigMatch> {
P
Phodal Huang 已提交
37
        if self.index >= self.patterns.clone().len() {
P
Phodal Huang 已提交
38
            self.index = 0;
39 40 41
            return None;
        }

P
Phodal Huang 已提交
42
        let mut after_pos_str = String::from("");
43
        let mut start_pos = start_position;
P
Phodal Huang 已提交
44
        let mut string_vec = origin_str.graphemes(true).collect::<Vec<&str>>();
45

P
Phodal Huang 已提交
46
        let mut has_utf8 = false;
P
Phodal Huang 已提交
47
        if string_vec.len() != origin_str.len() {
P
Phodal Huang 已提交
48 49 50
            has_utf8 = true;
        }

P
Phodal Huang 已提交
51
        if start_pos > string_vec.len() as i32 {
52 53
            return None;
        }
P
Phodal Huang 已提交
54

P
Phodal Huang 已提交
55
        if start_pos < 0 {
56
            start_pos = 0
P
Phodal Huang 已提交
57
        }
P
Phodal Huang 已提交
58

P
Phodal Huang 已提交
59 60 61
        let before_vec = string_vec[..start_pos as usize].to_owned();
        let after_vec = string_vec[start_pos as usize..].to_owned();

P
Phodal Huang 已提交
62
        // println!("before: {:?}, after_vec: {:?}", before_vec, after_vec);
P
Phodal Huang 已提交
63 64
        for x in after_vec {
            after_pos_str = after_pos_str + x
P
Phodal Huang 已提交
65 66
        }

P
Phodal Huang 已提交
67 68 69 70
        let pattern = self.patterns[self.index].clone();

        let regex = Regex::new(pattern.as_str()).unwrap();
        let mut capture_indices = vec![];
P
Phodal Huang 已提交
71
        let _captures = regex.captures(after_pos_str.as_str());
P
Phodal Huang 已提交
72

P
Phodal Huang 已提交
73 74 75
        if let Some(captures) = _captures {
            for (_, pos) in captures.iter_pos().enumerate() {
                if let Some((start, end)) = pos {
P
Phodal Huang 已提交
76 77
                    let length = end - start;

P
Phodal Huang 已提交
78
                    let mut capture = IOnigCaptureIndex {
P
Phodal Huang 已提交
79 80
                        start: start_pos as usize + start,
                        end: start_pos as usize + end,
P
Phodal Huang 已提交
81
                        length,
P
Phodal Huang 已提交
82
                    };
P
Phodal Huang 已提交
83 84

                    if has_utf8 {
P
Phodal Huang 已提交
85 86 87
                        let x1 = after_pos_str.split_at(end).0;
                        let utf8_end = before_vec.len() + x1.graphemes(true).collect::<Vec<&str>>().len() + 1;
                        let utf8_start = utf8_end - length;
P
Phodal Huang 已提交
88 89

                        capture = IOnigCaptureIndex {
P
Phodal Huang 已提交
90 91
                            start: utf8_start,
                            end: utf8_end,
P
Phodal Huang 已提交
92 93 94 95
                            length,
                        };
                    }

P
Phodal Huang 已提交
96
                    capture_indices.push(capture)
P
Phodal Huang 已提交
97
                }
P
Phodal Huang 已提交
98
            }
P
Phodal Huang 已提交
99
        }
P
Phodal Huang 已提交
100

P
Phodal Huang 已提交
101
        if capture_indices.len() <= 0 {
P
Phodal Huang 已提交
102
            self.index = self.index + 1;
103
            self.find_next_match_sync(origin_str.clone(), start_pos)
P
Phodal Huang 已提交
104 105
        } else {
            let index = self.index.clone();
P
Phodal Huang 已提交
106
            self.index = 0;
P
Phodal Huang 已提交
107 108 109 110
            Some(IOnigMatch {
                index,
                capture_indices,
            })
P
Phodal Huang 已提交
111
        }
P
Phodal Huang 已提交
112 113 114 115 116 117
    }
}


#[cfg(test)]
mod tests {
    use crate::scanner::scanner::{Scanner, IOnigMatch};

    /// Serializes a match to the JSON form the assertions compare against.
    fn to_json(found: &IOnigMatch) -> String {
        serde_json::to_string(found).unwrap()
    }

    /// Expected JSON for a match whose only capture spans `start..end`.
    fn single_capture(index: usize, start: usize, end: usize) -> String {
        format!(
            "{{\"index\":{},\"capture_indices\":[{{\"start\":{},\"end\":{},\"length\":{}}}]}}",
            index,
            start,
            end,
            end - start
        )
    }

    #[test]
    fn should_handle_simple_regex() {
        let regex = vec![String::from("ell"), String::from("wo")];
        let mut scanner = Scanner::new(regex);
        let s = String::from("Hello world!");

        let result = scanner.find_next_match_sync(s.clone(), 0).unwrap();
        assert_eq!(result.index, 0);
        assert_eq!(result.capture_indices[0].start, 1);
        assert_eq!(result.capture_indices[0].end, 4);

        let second_result = scanner.find_next_match_sync(s, 2).unwrap();
        assert_eq!(second_result.index, 1);
        assert_eq!(second_result.capture_indices[0].start, 6);
        assert_eq!(second_result.capture_indices[0].end, 8);
    }

    #[test]
    fn should_handle_simple2() {
        let regex = vec![String::from("a"), String::from("b"), String::from("c")];
        let mut scanner = Scanner::new(regex);

        assert!(scanner.find_next_match_sync(String::from("x"), 0).is_none());

        let result = scanner.find_next_match_sync(String::from("xxaxxbxxc"), 0).unwrap();
        assert_eq!(to_json(&result), single_capture(0, 2, 3));

        let result2 = scanner.find_next_match_sync(String::from("xxaxxbxxc"), 4).unwrap();
        assert_eq!(to_json(&result2), single_capture(1, 5, 6));

        let result3 = scanner.find_next_match_sync(String::from("xxaxxbxxc"), 7).unwrap();
        assert_eq!(to_json(&result3), single_capture(2, 8, 9));

        assert!(scanner.find_next_match_sync(String::from("xxaxxbxxc"), 9).is_none());
    }

    #[test]
    fn should_handle_unicode1() {
        let regex = vec![String::from("1"), String::from("2")];
        let mut scanner = Scanner::new(regex);

        let result = scanner.find_next_match_sync(String::from("ab…cde21"), 5).unwrap();
        assert_eq!(to_json(&result), single_capture(0, 8, 9));
    }

    #[test]
    fn should_handle_unicode2() {
        // NOTE(review): as written this literal is pure ASCII, for which the
        // scanner reports 1..2 rather than 2..3. The expectation only holds if
        // the string contains a multi-byte character between the quotes —
        // verify the literal against the upstream file.
        let mut scanner2 = Scanner::new(vec![String::from("\"")]);
        let result2 = scanner2.find_next_match_sync(String::from("{\"\": 1}"), 1).unwrap();
        assert_eq!(to_json(&result2), single_capture(0, 2, 3));
    }

    #[test]
    fn should_handle_unicode3() {
        let regex = vec![String::from("Y"), String::from("X")];
        let mut scanner = Scanner::new(regex);

        // Start positions 0 through 3 all find "Y" at the same span.
        for start in 0..4 {
            let result = scanner.find_next_match_sync(String::from("a💻bYX"), start).unwrap();
            assert_eq!(to_json(&result), single_capture(0, 4, 5));
        }

        let result4 = scanner.find_next_match_sync(String::from("a💻bYX"), 4).unwrap();
        assert_eq!(to_json(&result4), single_capture(1, 5, 6));

        // TODO: start position 5 is not asserted yet; the disabled expectation was:
        // let result5 = scanner.find_next_match_sync(String::from("a💻bYX"), 5).unwrap();
        // assert_eq!(serde_json::to_string(&result5).unwrap(), String::from("{\"index\":1,\"capture_indices\":[{\"start\":4,\"end\":5,\"length\":1}]}"));
    }

    #[test]
    fn should_out_of_bounds() {
        let mut scanner = Scanner::new(vec![String::from("X")]);

        // A negative start position is clamped to the beginning of the text.
        let result = scanner.find_next_match_sync(String::from("X💻X"), -10000).unwrap();
        assert_eq!(to_json(&result), single_capture(0, 1, 2));

        // A start position past the end of the text yields no match.
        let result2 = scanner.find_next_match_sync(String::from("X💻X"), 10000);
        assert!(result2.is_none());
    }

    #[test]
    fn should_handle_regex_g() {
        let mut scanner = Scanner::new(vec![String::from("\\G-and")]);

        let result = scanner.find_next_match_sync(String::from("first-and-second"), 0);
        assert!(result.is_none());

        let result2 = scanner.find_next_match_sync(String::from("first-and-second"), 5).unwrap();
        assert_eq!(to_json(&result2), single_capture(0, 5, 9));
    }
}