regex

./code/regex/main.cc
 1// g++ -std=c++11 ./main.cc
 2#include <algorithm>
 3#include <cctype>
 4#include <codecvt>
 5#include <iostream>
 6#include <locale>
 7#include <regex>
 8#include <string>
 9
// Returns a copy of `s` in which every byte has been passed through
// std::tolower. The conversion is byte-wise, so the result always has the
// same length as the input.
std::string ToLowerCase(const std::string &s) {
  std::string lowered;
  lowered.reserve(s.size());
  for (const char ch : s) {
    // std::tolower requires a value representable as unsigned char.
    const auto byte = static_cast<unsigned char>(ch);
    lowered.push_back(static_cast<char>(std::tolower(byte)));
  }
  return lowered;
}
16
// Decodes the UTF-8 byte string `s` into a std::wstring using
// std::wstring_convert with the codecvt_utf8_utf16<wchar_t> facet.
//
// see
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
std::wstring ToWideString(const std::string &s) {
  using Utf8WideConverter =
      std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>>;
  Utf8WideConverter utf8_to_wide;
  std::wstring wide = utf8_to_wide.from_bytes(s);
  return wide;
}
23
// Encodes the wide string `s` as a UTF-8 std::string using
// std::wstring_convert with the codecvt_utf8_utf16<wchar_t> facet.
//
// see
// https://stackoverflow.com/questions/2573834/c-convert-string-or-char-to-wstring-or-wchar-t
std::string ToString(const std::wstring &s) {
  using Utf8WideConverter =
      std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>>;
  Utf8WideConverter wide_to_utf8;
  std::string narrow = wide_to_utf8.to_bytes(s);
  return narrow;
}
30
31int32_t main() {
32  std::string s = "你(好)吗,HoW    are You doIng?  包含\"中文\"和'英文'。are "
33                  "you ok? 谢谢!好的;说:一句话…也“可以”";
34  std::cout << s << "\n";
35
36  std::vector<std::pair<std::string, std::string>> replace_str_pairs = {
37      {",", ","}, {";", ";"}, {"。", "."}, {"!", "!"}, {"\\s+", " "},
38  };
39  for (const auto &p : replace_str_pairs) {
40    std::regex re(p.first);
41    s = std::regex_replace(s, re, p.second);
42  }
43  std::cout << s << "\n";
44
45  s = ToLowerCase(s);
46
47  std::cout << s << "\n";
48
49  auto ws = ToWideString(s);
50
51  s = ToString(ws);
52  std::cout << s << "\n";
53
54  // https://en.cppreference.com/w/cpp/regex
55  // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
56  std::string expr =
57      "([;:,.?!'\"\\(\\)“”])|([\\u4e00-\\u9fff]+)|([\\u0000-\\u007f]+)";
58
59  std::wstring wexpr = ToWideString(expr);
60  std::wregex we(wexpr);
61
62  auto begin = std::wsregex_iterator(ws.begin(), ws.end(), we);
63  auto end = std::wsregex_iterator();
64  for (std::wsregex_iterator i = begin; i != end; ++i) {
65    std::wsmatch match = *i;
66    std::wstring match_str = match.str();
67    auto ms = ToString(match_str);
68    uint8_t c = reinterpret_cast<const uint8_t *>(ms.data())[0];
69    if (c < 0x80) {
70      std::cout << "Non-Chinese: " << ms << "\n";
71    } else {
72      std::cout << "Chinese: " << ms << "\n";
73    }
74  }
75
76  return 0;
77}
78/*
79你好吗,HoW    are You doIng?  包含中文和英文。are you ok? 谢谢
80你好吗,HoW are You doIng? 包含中文和英文.are you ok? 谢谢
81你好吗,how are you doing? 包含中文和英文.are you ok? 谢谢
82你好吗,how are you doing? 包含中文和英文.are you ok? 谢谢
83Chinese: 你好吗
84Non-Chinese: ,how are you doing?
85Chinese: 包含中文和英文
86Non-Chinese: .are you ok?
87Chinese: 谢谢
88 */