Skip to content

term_normalizer

TermNormalizer

Normalizes clinical term strings for high-precision matching.

Source code in src/ariadne/verbatim_mapping/term_normalizer.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
class TermNormalizer:
    """
    Normalizes clinical term strings for high-precision matching.
    """

    def __init__(self, substrings_to_remove: List[str] | None = None):
        self.substrings_to_remove = substrings_to_remove or []
        try:
            self.nlp = spacy.load("en_core_web_sm")
            print("spaCy model 'en_core_web_sm' loaded successfully.")
        except IOError:
            print("spaCy model 'en_core_web_sm' not found.")
            print("Please run: python -m spacy download en_core_web_sm")
            raise

    def normalize_term(self, term: str) -> str:
        """
        Normalizes a clinical term string for high-precision matching.

        The pipeline is:

        1. Convert to lowercase.
        2. Remove possessive "'s" at the end of words.
        3. Remove specific non-informative substrings (e.g., '(disorder)'). The strings are taken from the
           substrings_to_remove list in the config_condition_mapping.yaml file
        4. Remove all punctuation.
        5. Tokenize and lemmatize (e.g., "disorders" -> "disorder").
        6. Join tokens into a single string, preserving order.

        This makes "liver disorders" and "Liver-Disorders (disorder)"
        both normalize to "liver disorder".

        Args:
            term: The clinical term string to normalize.
        Returns:
            The normalized term string.
        """
        # 1. Convert to lowercase
        term = term.lower()

        # 2. Remove possessive 's at the end of a word
        # This handles "Alzheimer's disease" -> "Alzheimer disease"
        # It finds a word character (\w) followed by 's and a word boundary (\b),
        # and replaces the whole thing with just the captured word character (group 1).
        term = re.sub(r"(\w)'s\b", r"\1", term)

        # 3. Remove specific non-informative substrings
        for sub in self.substrings_to_remove:
            term = term.replace(sub, ' ')

        # 4. Remove all punctuation (replace with a space)
        # This handles "liver-disorder" and "liver, disorder"
        term = re.sub(r'[^\w\s]', ' ', term)

        # 5. Tokenize and lemmatize using spaCy
        doc = self.nlp(term)

        processed_tokens = []
        for token in doc:
            # Get the lemma (base form)
            lemma = token.lemma_

            # 6. Remove empty tokens (from extra spaces)
            if lemma.strip():
                processed_tokens.append(lemma)

        # 7. Join tokens into a single string
        return " ".join(processed_tokens)

    def normalize_terms(self, terms: List[str], batch_size: int = 1000, n_process: int = 4) -> List[str]:
        """
        Normalizes a list of clinical term strings in batch using spaCy's nlp.pipe for efficiency.

        Args:
            terms: List of clinical term strings to normalize.
            batch_size: Number of terms to process in each spaCy batch.
            n_process: Number of worker processes for spaCy's pipe (1 disables multiprocessing).
        Returns:
            List of normalized term strings in the same order as the input.
        """
        # Pre-process steps 1-4 before sending to spaCy
        preprocessed = []
        for term in terms:
            t = term.lower()
            t = re.sub(r"(\w)'s\b", r"\1", t)
            for sub in self.substrings_to_remove:
                t = t.replace(sub, ' ')
            t = re.sub(r'[^\w\s]', ' ', t)
            preprocessed.append(t)

        # Use spaCy's built-in batch processing; n_process>1 enables multiprocessing.
        results = []
        for doc in self.nlp.pipe(preprocessed, batch_size=batch_size, n_process=n_process):
            processed_tokens = [token.lemma_ for token in doc if token.lemma_.strip()]
            results.append(" ".join(processed_tokens))
        return results

normalize_term(term)

Normalizes a clinical term string for high-precision matching.

The pipeline is:

  1. Convert to lowercase.
  2. Remove possessive "'s" at the end of words.
  3. Remove specific non-informative substrings (e.g., '(disorder)'). The strings are taken from the substrings_to_remove list in the config_condition_mapping.yaml file
  4. Remove all punctuation.
  5. Tokenize and lemmatize (e.g., "disorders" -> "disorder").
  6. Join tokens into a single string, preserving order.

This makes "liver disorders" and "Liver-Disorders (disorder)" both normalize to "liver disorder".

Parameters:

Name Type Description Default
term str

The clinical term string to normalize.

required

Returns: The normalized term string.

Source code in src/ariadne/verbatim_mapping/term_normalizer.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def normalize_term(self, term: str) -> str:
    """
    Normalizes a clinical term string for high-precision matching.

    The pipeline is:

    1. Convert to lowercase.
    2. Remove possessive "'s" at the end of words.
    3. Remove specific non-informative substrings (e.g., '(disorder)'). The strings are taken from the
       substrings_to_remove list in the config_condition_mapping.yaml file
    4. Remove all punctuation.
    5. Tokenize and lemmatize (e.g., "disorders" -> "disorder").
    6. Join tokens into a single string, preserving order.

    This makes "liver disorders" and "Liver-Disorders (disorder)"
    both normalize to "liver disorder".

    Args:
        term: The clinical term string to normalize.
    Returns:
        The normalized term string.
    """
    # Step 1: case-fold the whole term.
    lowered = term.lower()

    # Step 2: drop a trailing possessive 's after any word character,
    # keeping the captured character ("alzheimer's" -> "alzheimer").
    depossessed = re.sub(r"(\w)'s\b", r"\1", lowered)

    # Step 3: blank out each configured non-informative substring.
    for fragment in self.substrings_to_remove:
        depossessed = depossessed.replace(fragment, ' ')

    # Step 4: turn all remaining punctuation into spaces, which also
    # splits hyphenated and comma-separated forms ("liver-disorder").
    cleaned = re.sub(r'[^\w\s]', ' ', depossessed)

    # Steps 5-7: run spaCy for tokenization + lemmatization, keep only
    # lemmas that are non-empty after stripping (skips whitespace tokens),
    # and rejoin them in their original order.
    return " ".join(
        token.lemma_ for token in self.nlp(cleaned) if token.lemma_.strip()
    )

normalize_terms(terms, batch_size=1000, n_process=4)

Normalizes a list of clinical term strings in batch using spaCy's nlp.pipe for efficiency.

Parameters:

Name Type Description Default
terms List[str]

List of clinical term strings to normalize.

required
batch_size int

Number of terms to process in each spaCy batch.

1000
n_process int

Number of worker processes for spaCy's pipe (1 disables multiprocessing).

4

Returns: List of normalized term strings in the same order as the input.

Source code in src/ariadne/verbatim_mapping/term_normalizer.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def normalize_terms(self, terms: List[str], batch_size: int = 1000, n_process: int = 4) -> List[str]:
    """
    Normalizes a list of clinical term strings in batch using spaCy's nlp.pipe for efficiency.

    Args:
        terms: List of clinical term strings to normalize.
        batch_size: Number of terms to process in each spaCy batch.
        n_process: Number of worker processes for spaCy's pipe (1 disables multiprocessing).
    Returns:
        List of normalized term strings in the same order as the input.
    """
    def _scrub(raw: str) -> str:
        # Pipeline steps 1-4: lowercase, strip possessive 's, remove the
        # configured substrings, then replace punctuation with spaces.
        cleaned = re.sub(r"(\w)'s\b", r"\1", raw.lower())
        for fragment in self.substrings_to_remove:
            cleaned = cleaned.replace(fragment, ' ')
        return re.sub(r'[^\w\s]', ' ', cleaned)

    # Hand the pre-cleaned strings to spaCy's batched pipeline; n_process>1
    # turns on multiprocessing, and output order matches input order.
    docs = self.nlp.pipe(
        [_scrub(t) for t in terms], batch_size=batch_size, n_process=n_process
    )

    # Steps 5-7: lemmatize each doc and rejoin its non-empty lemmas.
    return [
        " ".join(tok.lemma_ for tok in doc if tok.lemma_.strip())
        for doc in docs
    ]