Skip to content

Support list variables in match_dicts #72

@sam-writer

Description

@sam-writer

In our newest project, we are using a wrapped version of replaCy to support list variables in match_dicts, like so:

import json
import os
from typing import List

from replacy import ReplaceMatcher
from replacy.db import load_json as load_replacy_files_from_directory


# Absolute directory of this module; resource paths below are resolved against it
# so that loading does not depend on the current working directory.
here = os.path.dirname(os.path.abspath(__file__))


class ModifiedReplaceMatcher:
    """
    Wrapper around replaCy's ReplaceMatcher that supports list variables.

    Match dicts may contain strings of the form "$REF:<name>". When such a
    string is an element of a list, it is replaced by the words stored under
    <name> in the vocab refs JSON file, spliced into that list in place.
    """

    # Marker that introduces a vocabulary reference inside a match dict.
    _REF_PREFIX = "$REF:"

    def __init__(self):
        """
        Load the match dicts and vocab refs that live next to this module
        and store the expanded match dict in self.rmatch_dict.
        """
        rd_path = os.path.join(here, "resources/match_dicts")
        proto_match_dict = load_replacy_files_from_directory(rd_path)
        vocab_refs = self._load_vocab_refs("resources/variables/vocab_refs.json")
        self.rmatch_dict = self._refine_match_dict(proto_match_dict, vocab_refs)

    def _load_vocab_refs(self, vocab_refs_path: str):
        """
        Parse the JSON file at vocab_refs_path (relative to this module's
        directory) and return its content.
        """
        file_path = os.path.join(here, vocab_refs_path)
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _remove_square_brackets_from_list_of_strings(self, l: List[str]) -> str:
        """
        Serialize a list of strings as a JSON array body without the brackets:
        turns ["a", "b", "c"]
        into '"a", "b", "c"'

        Every item is JSON-escaped, so wrapping the result in "[" and "]"
        always gives valid JSON that parses back to the input, even when an
        item contains quotes or backslashes. An empty list yields an empty
        string.

        No longer used by _refine_match_dict; kept for existing callers.
        """
        return ", ".join(json.dumps(item, ensure_ascii=False) for item in l)

    def _lookup_ref(self, value, vocab_refs: dict):
        """
        Return the vocab list that value refers to, or None when value is not
        a "$REF:<name>" string or <name> is not a key of vocab_refs.
        """
        if not isinstance(value, str) or not value.startswith(self._REF_PREFIX):
            return None
        return vocab_refs.get(value[len(self._REF_PREFIX):])

    def _expand_scalar(self, value, vocab_refs: dict):
        """
        Resolve a reference that is NOT a list element (dict key or dict value).

        A list cannot be spliced into such a position, so only references
        with exactly one entry can be substituted; anything else raises.
        """
        words = self._lookup_ref(value, vocab_refs)
        if words is None:
            return value
        if len(words) != 1:
            raise ValueError(
                f"{value!r} expands to {len(words)} items and can only be "
                "used as an element of a list"
            )
        return words[0]

    def _expand_refs(self, node, vocab_refs: dict):
        """Return a copy of node with every vocab reference expanded."""
        if isinstance(node, dict):
            return {
                self._expand_scalar(key, vocab_refs): self._expand_refs(value, vocab_refs)
                for key, value in node.items()
            }
        if isinstance(node, (list, tuple)):
            expanded = []
            for item in node:
                words = self._lookup_ref(item, vocab_refs)
                if words is None:
                    expanded.append(self._expand_refs(item, vocab_refs))
                else:
                    # Splice the words in place of the reference. They are
                    # taken literally (no nested expansion), so the result
                    # does not depend on the order of vocab_refs.
                    expanded.extend(words)
            return expanded
        return self._expand_scalar(node, vocab_refs)

    def _refine_match_dict(self, match_dict: dict, vocab_refs: dict) -> dict:
        """
        Replace $REF:something by vocab list
        from vocab_refs.json file
        This should probably be replaCy functionality

        The structure is walked directly instead of being edited as JSON
        text, so words containing quotes, backslashes or non-ASCII characters
        are handled correctly and an empty vocab list expands to nothing.

        Args:
            match_dict: the parsed match dict; it is not modified.
            vocab_refs: mapping from reference name to its list of words.

        Returns:
            A new structure with every known reference expanded. References
            whose name is not in vocab_refs are left untouched.

        Raises:
            ValueError: if a reference with zero or several entries is used
                as a dict key or dict value rather than as a list element.
        """
        return self._expand_refs(match_dict, vocab_refs)

    def get_matcher(self, nlp, kenlm_path):
        """
        Build a ReplaceMatcher from the expanded match dict.

        Args:
            nlp: passed through as ReplaceMatcher's first argument
                (presumably a spaCy pipeline - confirm against replaCy).
            kenlm_path: passed through as ReplaceMatcher's lm_path.
        """
        return ReplaceMatcher(nlp, match_dict=self.rmatch_dict, lm_path=kenlm_path)

Here, resources/variables/vocab_refs.json would have an entry like:

{
  "variable-name": [
    "hello",
    "hi",
    "yo"
  ]
}

This allows for a match dict syntax like:

[
  {"LOWER":{"IN": ["$REF:variable-name"]}}
]

This is convenient for frequently used lists of words. Note that list variables are easier to support than dict variables would be, though.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions