Skip to content

term_normalizer

TermNormalizer

Normalizes clinical term strings for high-precision matching.

Source code in src/ariadne/verbatim_mapping/term_normalizer.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
class TermNormalizer:
    """
    Normalizes clinical term strings for high-precision matching.
    """

    def __init__(self, substrings_to_remove: List[str] | None = None):
        self.substrings_to_remove = substrings_to_remove or []
        try:
            self.nlp = spacy.load("en_core_web_sm")
            print("spaCy model 'en_core_web_sm' loaded successfully.")
        except IOError:
            print("spaCy model 'en_core_web_sm' not found.")
            print("Please run: python -m spacy download en_core_web_sm")
            raise

    def normalize_term(self, term: str) -> str:
        """
        Normalizes a clinical term string for high-precision matching.

        The pipeline is:

        1. Convert to lowercase.
        2. Remove possessive "'s" at the end of words.
        3. Remove specific non-informative substrings (e.g., '(disorder)'). The strings are taken from the
           substrings_to_remove list in the config_condition_mapping.yaml file
        4. Remove all punctuation.
        5. Tokenize and lemmatize (e.g., "disorders" -> "disorder").
        6. Join tokens into a single string, preserving order.

        This makes "liver disorders" and "Liver-Disorders (disorder)"
        both normalize to "liver disorder".

        Args:
            term: The clinical term string to normalize.
        Returns:
            The normalized term string.
        """
        # 1. Convert to lowercase
        term = term.lower()

        # 2. Remove possessive 's at the end of a word
        # This handles "Alzheimer's disease" -> "Alzheimer disease"
        # It finds a word character (\w) followed by 's and a word boundary (\b),
        # and replaces the whole thing with just the captured word character (group 1).
        term = re.sub(r"(\w)'s\b", r"\1", term)

        # 3. Remove specific non-informative substrings
        for sub in self.substrings_to_remove:
            term = term.replace(sub, ' ')

        # 4. Remove all punctuation (replace with a space)
        # This handles "liver-disorder" and "liver, disorder"
        term = re.sub(r'[^\w\s]', ' ', term)

        # 5. Tokenize and lemmatize using spaCy
        doc = self.nlp(term)

        processed_tokens = []
        for token in doc:
            # Get the lemma (base form)
            lemma = token.lemma_

            # 6. Remove empty tokens (from extra spaces)
            if lemma.strip():
                processed_tokens.append(lemma)

        # 7. Join tokens into a single string
        return " ".join(processed_tokens)

    def normalize_terms(self, terms: List[str], batch_size: int = 1000, n_process: int = 4) -> List[str]:
        """
        Normalizes a list of clinical term strings in batch using spaCy's nlp.pipe for efficiency.

        Args:
            terms: List of clinical term strings to normalize.
            batch_size: Number of terms to process in each spaCy batch.
            n_process: Number of worker processes for spaCy's pipe (1 disables multiprocessing).
        Returns:
            List of normalized term strings in the same order as the input.
        """
        # Pre-process steps 1-4 before sending to spaCy
        preprocessed = []
        for term in terms:
            t = term.lower()
            t = re.sub(r"(\w)'s\b", r"\1", t)
            for sub in self.substrings_to_remove:
                t = t.replace(sub, ' ')
            t = re.sub(r'[^\w\s]', ' ', t)
            preprocessed.append(t)

        # Use spaCy's built-in batch processing; n_process>1 enables multiprocessing.
        results = []
        for doc in self.nlp.pipe(preprocessed, batch_size=batch_size, n_process=n_process):
            processed_tokens = [token.lemma_ for token in doc if token.lemma_.strip()]
            results.append(" ".join(processed_tokens))
        return results

normalize_term(term)

Normalizes a clinical term string for high-precision matching.

The pipeline is:

  1. Convert to lowercase.
  2. Remove possessive "'s" at the end of words.
  3. Remove specific non-informative substrings (e.g., '(disorder)'). The strings are taken from the substrings_to_remove list in the config_condition_mapping.yaml file
  4. Remove all punctuation.
  5. Tokenize and lemmatize (e.g., "disorders" -> "disorder").
  6. Join tokens into a single string, preserving order.

This makes "liver disorders" and "Liver-Disorders (disorder)" both normalize to "liver disorder".

Parameters:

Name Type Description Default
term str

The clinical term string to normalize.

required

Returns: The normalized term string.

Source code in src/ariadne/verbatim_mapping/term_normalizer.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def normalize_term(self, term: str) -> str:
    """
    Normalizes a clinical term string for high-precision matching.

    The pipeline is:

    1. Convert to lowercase.
    2. Remove possessive "'s" at the end of words.
    3. Remove specific non-informative substrings (e.g., '(disorder)'). The strings are taken from the
       substrings_to_remove list in the config_condition_mapping.yaml file
    4. Remove all punctuation.
    5. Tokenize and lemmatize (e.g., "disorders" -> "disorder").
    6. Join tokens into a single string, preserving order.

    This makes "liver disorders" and "Liver-Disorders (disorder)"
    both normalize to "liver disorder".

    Args:
        term: The clinical term string to normalize.
    Returns:
        The normalized term string.
    """
    # Step 1: case-fold the whole term.
    lowered = term.lower()

    # Step 2: drop a trailing possessive 's after any word character,
    # keeping the captured character ("alzheimer's" -> "alzheimer").
    depossessed = re.sub(r"(\w)'s\b", r"\1", lowered)

    # Step 3: blank out each configured non-informative substring.
    for fragment in self.substrings_to_remove:
        depossessed = depossessed.replace(fragment, ' ')

    # Step 4: turn all remaining punctuation into spaces, which also
    # splits hyphenated and comma-separated forms ("liver-disorder").
    cleaned = re.sub(r'[^\w\s]', ' ', depossessed)

    # Steps 5-7: run spaCy for tokenization + lemmatization, keep only
    # lemmas that are non-empty after stripping (skips whitespace tokens),
    # and rejoin them in their original order.
    return " ".join(
        token.lemma_ for token in self.nlp(cleaned) if token.lemma_.strip()
    )

normalize_terms(terms, batch_size=1000, n_process=4)

Normalizes a list of clinical term strings in batch using spaCy's nlp.pipe for efficiency.

Parameters:

Name Type Description Default
terms List[str]

List of clinical term strings to normalize.

required
batch_size int

Number of terms to process in each spaCy batch.

1000
n_process int

Number of worker processes for spaCy's pipe (1 disables multiprocessing).

4

Returns: List of normalized term strings in the same order as the input.

Source code in src/ariadne/verbatim_mapping/term_normalizer.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def normalize_terms(self, terms: List[str], batch_size: int = 1000, n_process: int = 4) -> List[str]:
    """
    Normalizes a list of clinical term strings in batch using spaCy's nlp.pipe for efficiency.

    Args:
        terms: List of clinical term strings to normalize.
        batch_size: Number of terms to process in each spaCy batch.
        n_process: Number of worker processes for spaCy's pipe (1 disables multiprocessing).
    Returns:
        List of normalized term strings in the same order as the input.
    """
    def _scrub(raw: str) -> str:
        # Pipeline steps 1-4: lowercase, strip possessive 's, remove the
        # configured substrings, then replace punctuation with spaces.
        cleaned = re.sub(r"(\w)'s\b", r"\1", raw.lower())
        for fragment in self.substrings_to_remove:
            cleaned = cleaned.replace(fragment, ' ')
        return re.sub(r'[^\w\s]', ' ', cleaned)

    # Hand the pre-cleaned strings to spaCy's batched pipeline; n_process>1
    # turns on multiprocessing, and output order matches input order.
    docs = self.nlp.pipe(
        [_scrub(t) for t in terms], batch_size=batch_size, n_process=n_process
    )

    # Steps 5-7: lemmatize each doc and rejoin its non-empty lemmas.
    return [
        " ".join(tok.lemma_ for tok in doc if tok.lemma_.strip())
        for doc in docs
    ]