ワードリストから正規表現のパターンを生成
環境
Visual C++ 2019
概要
ワードリスト(配列)から正規表現のパターンを生成します。
PerlやRubyにはあるけどもc++が見つからなかったので参考を基に作成してみました。
下のコードのワードリストの結果だと (?:tange(?:rine|lo)|pomelo|orange|l(?:ime|emon)|grape(?:fruit)?|citron)
が生成されます。
コード
#pragma once
class RegexpTrie
{
public:
static void Initialize(void);
static void Terminate(void);
static const char* ToRegexPattern(const char** ppWords, size_t size);
};
#define _CRT_SECURE_NO_WARNINGS
#include "RegexpTrie.h"
#include <string>
#include <map>
#include <vector>
#include <algorithm>
struct TrieNode
{
std::map<char, TrieNode> nodes;
};
static TrieNode g_head;
static char* g_pPattern = nullptr;
static size_t g_patternLen = 0;
void Add(std::string word)
{
auto pTrie = &g_head.nodes;
for (auto i = 0; i < word.length(); i++)
{
auto chr = word[i];
if (pTrie->find(chr) == pTrie->end())
{
(*pTrie)[chr].nodes.clear();
}
pTrie = &(*pTrie)[chr].nodes;
}
(*pTrie)['*'].nodes.clear();
}
void JoinString(std::vector<std::string>& sequences, std::string& outStr, std::string delimiter)
{
outStr = "";
for (auto it = sequences.begin(); it != sequences.end(); it++)
{
auto str = *it;
if (str == "*")
break;
if (it != sequences.begin())
outStr += delimiter;
outStr += str;
}
}
void ToRegixp(std::map<char, TrieNode>* pTrie, std::string& pattern)
{
if (pTrie->size() == 0)
return;
else if (pTrie->size() == 1)
{
auto it = pTrie->begin();
auto pair = *it;
auto key = pair.first;
auto value = pair.second;
if (key == '*')
{
return;
}
else
{
pattern += key;
ToRegixp(&(*pTrie)[key].nodes, pattern);
return;
}
}
else
{
std::vector<std::string> sequences;
for (auto it = pTrie->begin(); it != pTrie->end(); it++)
{
auto pair = *it;
auto key = pair.first;
auto value = pair.second;
std::string str = "";
str += key;
ToRegixp(&(*pTrie)[key].nodes, str);
sequences.push_back(str);
}
std::sort(sequences.rbegin(), sequences.rend());
std::string result = "";
if (sequences.size() == 1)
{
result = sequences[0];
if (sequences[0].size() > 1)
{
result = "(?:" + result + ")";
}
}
else
{
std::string join = "";
JoinString(sequences, join, "");
if (sequences.size() == join.size())
result = "[" + join + "]";
else
{
std::string joinOr = "";
JoinString(sequences, joinOr, "|");
result = "(?:" + joinOr + ")";
}
if (pTrie->find('*') != pTrie->end())
result = result + "?";
}
pattern += result;
}
}
static bool g_initialized = false;
void RegexpTrie::Initialize(void)
{
if (g_initialized == true)
return;
g_initialized = true;
g_pPattern = nullptr;
g_patternLen = 0;
}
void RegexpTrie::Terminate(void)
{
free(g_pPattern);
g_pPattern = nullptr;
g_initialized = false;
}
const char* RegexpTrie::ToRegexPattern(const char** ppWords, size_t size)
{
g_head.nodes.clear();
for (auto i = 0; i < size; i++)
{
Add(ppWords[i]);
}
std::string pattern = "";
ToRegixp(&g_head.nodes, pattern);
if (g_patternLen < pattern.size())
{
free(g_pPattern);
g_pPattern = (char*)malloc(pattern.size() + 1);
if (g_pPattern != nullptr)
{
g_patternLen = pattern.size();
g_pPattern[g_patternLen] = '\0';
}
}
if (g_pPattern == nullptr)
return "";
strncpy(g_pPattern, pattern.c_str(), g_patternLen);
return g_pPattern;
}
#include "RegexpTrie.h"
#include <stdio.h>
#include <regex>
int main()
{
RegexpTrie::Initialize();
const char* pWords[] =
{
"lemon",
"lime",
"pomelo",
"orange",
"citron",
"grapefruit",
"grape",
"tangerine",
"tangelo",
};
const size_t wordNum = sizeof(pWords) / sizeof(char*);
auto pPattern = RegexpTrie::ToRegexPattern(pWords, wordNum);
printf("%s\n", pPattern);
auto regex = std::regex(pPattern);
const char* pAddWords[] =
{
"olemon",
"lemon0",
"doraemon",
};
const size_t addWordNum = sizeof(pAddWords) / sizeof(char*);
auto ppSearchWords = new char* [wordNum + addWordNum];
memcpy(ppSearchWords, pWords, sizeof(char*) * wordNum);
memcpy(&ppSearchWords[wordNum], pAddWords, sizeof(char*) * addWordNum);
const size_t searchNum = wordNum + addWordNum;
for (int i = 0; i < searchNum; i++)
{
std::cmatch cmatch;
auto res = std::regex_search(ppSearchWords[i], cmatch, regex);
printf("%s:%d match:%s prefix:%s sufix:%s\n", ppSearchWords[i], res, cmatch.str().c_str(), cmatch.prefix().str().c_str(), cmatch.suffix().str().c_str());
}
delete[] ppSearchWords;
RegexpTrie::Terminate();
}
参考
https://github.com/s9e/RegexpBuilder
https://github.com/ermanh/trieregex