term_normalizer

TermNormalizer

Normalizes clinical term strings for high-precision matching.

Source code in src/ariadne/verbatim_mapping/term_normalizer.py
import re

import spacy

# `Config` is imported from the project's configuration module (exact import
# path not shown in this excerpt).


class TermNormalizer:
    """
    Normalizes clinical term strings for high-precision matching.
    """

    def __init__(self, config: Config | None = None):
        # Avoid a shared default instance: build the default Config per
        # instance instead of once at class-definition time.
        self.config = config if config is not None else Config()
        try:
            self.nlp = spacy.load("en_core_web_sm")
            print("spaCy model 'en_core_web_sm' loaded successfully.")
        except OSError:
            print("spaCy model 'en_core_web_sm' not found.")
            print("Please run: python -m spacy download en_core_web_sm")
            raise

    def normalize_term(self, term: str) -> str:
        """
        Normalizes a clinical term string for high-precision matching.

        The pipeline is:

        1. Convert to lowercase.
        2. Remove possessive "'s" at the end of words.
        3. Remove specific non-informative substrings (e.g., '(disorder)'),
           taken from the `substrings_to_remove` list in the config YAML file.
        4. Remove all punctuation.
        5. Tokenize and lemmatize (e.g., "disorders" -> "disorder").
        6. Drop empty tokens left over from extra whitespace.
        7. Join the remaining tokens into a single string, preserving order.

        This makes "liver disorders" and "Liver-Disorders (disorder)"
        both normalize to "liver disorder".

        Args:
            term: The clinical term string to normalize.
        Returns:
            The normalized term string.
        """
        # 1. Convert to lowercase
        term = term.lower()

        # 2. Remove possessive 's at the end of a word
        # This handles "Alzheimer's disease" -> "Alzheimer disease"
        # It finds a word character (\w) followed by 's and a word boundary (\b),
        # and replaces the whole thing with just the captured word character (group 1).
        term = re.sub(r"(\w)'s\b", r"\1", term)

        # 3. Remove specific non-informative substrings
        for sub in self.config.verbatim_mapping.substrings_to_remove:
            term = term.replace(sub, ' ')

        # 4. Remove all punctuation (replace with a space)
        # This handles "liver-disorder" and "liver, disorder"
        term = re.sub(r'[^\w\s]', ' ', term)

        # 5. Tokenize and lemmatize using spaCy
        doc = self.nlp(term)

        processed_tokens = []
        for token in doc:
            # Get the lemma (base form)
            lemma = token.lemma_

            # 6. Remove empty tokens (from extra spaces)
            if lemma.strip():
                processed_tokens.append(lemma)

        # 7. Join tokens into a single string
        return " ".join(processed_tokens)

normalize_term(term)

Normalizes a clinical term string for high-precision matching.

The pipeline is:

  1. Convert to lowercase.
  2. Remove possessive "'s" at the end of words.
  3. Remove specific non-informative substrings (e.g., '(disorder)'), taken from the substrings_to_remove list in the config YAML file.
  4. Remove all punctuation.
  5. Tokenize and lemmatize (e.g., "disorders" -> "disorder").
  6. Drop empty tokens left over from extra whitespace.
  7. Join the remaining tokens into a single string, preserving order.

This makes "liver disorders" and "Liver-Disorders (disorder)" both normalize to "liver disorder".

Parameters:

    term (str): The clinical term string to normalize. Required.

Returns:

    str: The normalized term string.
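
Step 3 depends on the config; a hypothetical sketch of that step on its own (the list literal here stands in for values that really live in the config YAML under verbatim_mapping.substrings_to_remove):

    # '(disorder)' is the documented example; any other entries are up to the config.
    substrings_to_remove = ["(disorder)"]

    term = "liver disorders (disorder)".lower()
    for sub in substrings_to_remove:
        term = term.replace(sub, " ")
    # -> "liver disorders  "  (steps 5-7 absorb the extra whitespace)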
