ワードリストから正規表現のパターンを生成

環境

Visual C++ 2019

概要

ワードリスト(配列)から正規表現のパターンを生成します。

PerlRubyにはあるけどもc++が見つからなかったので参考を基に作成してみました。

下のコードのワードリストの結果だと (?:tange(?:rine|lo)|pomelo|orange|l(?:ime|emon)|grape(?:fruit)?|citron)

が生成されます。

コード

#pragma once

class RegexpTrie
{
public:
    static void Initialize(void);
    static void Terminate(void);
    static const char* ToRegexPattern(const char** ppWords, size_t size);
};
#define _CRT_SECURE_NO_WARNINGS
#include "RegexpTrie.h"
#include <string>
#include <map>
#include <vector>
#include <algorithm>

struct TrieNode
{
    std::map<char, TrieNode> nodes;
};
static TrieNode g_head;
static char* g_pPattern = nullptr;
static size_t g_patternLen = 0;

void Add(std::string word)
{
    auto pTrie = &g_head.nodes;
    for (auto i = 0; i < word.length(); i++)
    {
        auto chr = word[i];
        if (pTrie->find(chr) == pTrie->end())
        {
            (*pTrie)[chr].nodes.clear();
        }
        pTrie = &(*pTrie)[chr].nodes;
    }
    (*pTrie)['*'].nodes.clear();
}

void JoinString(std::vector<std::string>& sequences, std::string& outStr, std::string delimiter)
{
    outStr = "";
    for (auto it = sequences.begin(); it != sequences.end(); it++)
    {
        auto str = *it;
        if (str == "*")
            break;
        if (it != sequences.begin())
            outStr += delimiter;
        outStr += str;
    }
}

void ToRegixp(std::map<char, TrieNode>* pTrie, std::string& pattern)
{
    if (pTrie->size() == 0)
        return;
    else if (pTrie->size() == 1)
    {
        auto it = pTrie->begin();
        auto pair = *it;
        auto key = pair.first;
        auto value = pair.second;
        if (key == '*')
        {
            return;
        }
        else
        {
            pattern += key;
            ToRegixp(&(*pTrie)[key].nodes, pattern);
            return;
        }
    }
    else
    {
        std::vector<std::string> sequences;
        for (auto it = pTrie->begin(); it != pTrie->end(); it++)
        {
            auto pair = *it;
            auto key = pair.first;
            auto value = pair.second;
            std::string str = "";
            str += key;
            ToRegixp(&(*pTrie)[key].nodes, str);
            sequences.push_back(str);
        }
        std::sort(sequences.rbegin(), sequences.rend());

        std::string result = "";
        if (sequences.size() == 1)
        {
            result = sequences[0];
            if (sequences[0].size() > 1)
            {
                result = "(?:" + result + ")";
            }
        }
        else
        {
            std::string join = "";
            JoinString(sequences, join, "");
            if (sequences.size() == join.size())
                result = "[" + join + "]";
            else
            {
                std::string joinOr = "";
                JoinString(sequences, joinOr, "|");
                result = "(?:" + joinOr + ")";
            }

            if (pTrie->find('*') != pTrie->end())
                result = result + "?";
        }
        pattern += result;
    }
}

static bool g_initialized = false;
void RegexpTrie::Initialize(void)
{
    if (g_initialized == true)
        return;
    g_initialized = true;

    g_pPattern = nullptr;
    g_patternLen = 0;
}

void RegexpTrie::Terminate(void)
{
    free(g_pPattern);
    g_pPattern = nullptr;

    g_initialized = false;
}

const char* RegexpTrie::ToRegexPattern(const char** ppWords, size_t size)
{
    g_head.nodes.clear();
    for (auto i = 0; i < size; i++)
    {
        Add(ppWords[i]);
    }
    std::string pattern = "";
    ToRegixp(&g_head.nodes, pattern);
    
    if (g_patternLen < pattern.size())
    {
        free(g_pPattern);
        g_pPattern = (char*)malloc(pattern.size() + 1);
        if (g_pPattern != nullptr)
        {
            g_patternLen = pattern.size();
            g_pPattern[g_patternLen] = '\0';
        }
    }
    if (g_pPattern == nullptr)
        return "";
    strncpy(g_pPattern, pattern.c_str(), g_patternLen);
    return g_pPattern;
}
#include "RegexpTrie.h"
#include <stdio.h>
#include <regex>

int main()
{
    RegexpTrie::Initialize();
    const char* pWords[] =
    {
        "lemon",
        "lime",
        "pomelo",
        "orange",
        "citron",
        "grapefruit",
        "grape",
        "tangerine",
        "tangelo",
    };
    const size_t wordNum = sizeof(pWords) / sizeof(char*);
    auto pPattern = RegexpTrie::ToRegexPattern(pWords, wordNum);
    printf("%s\n", pPattern);
    auto regex = std::regex(pPattern);

    const char* pAddWords[] =
    {
        "olemon",
        "lemon0",
        "doraemon",
    };
    const size_t addWordNum = sizeof(pAddWords) / sizeof(char*);
    auto ppSearchWords = new char* [wordNum + addWordNum];
    memcpy(ppSearchWords, pWords, sizeof(char*) * wordNum);
    memcpy(&ppSearchWords[wordNum], pAddWords, sizeof(char*) * addWordNum);
    const size_t searchNum = wordNum + addWordNum;

    for (int i = 0; i < searchNum; i++)
    {
        std::cmatch cmatch;
        auto res = std::regex_search(ppSearchWords[i], cmatch, regex);
        printf("%s:%d match:%s prefix:%s sufix:%s\n", ppSearchWords[i], res, cmatch.str().c_str(), cmatch.prefix().str().c_str(), cmatch.suffix().str().c_str());
    }

    delete[] ppSearchWords;
    RegexpTrie::Terminate();
}

参考

正規表現じぇねれーた

https://github.com/s9e/RegexpBuilder

https://github.com/ermanh/trieregex

https://github.com/itchyny/rassemble-go

https://github.com/sters/php-regexp-trie