term_cleaner

`TermCleaner`

A class to clean clinical terms by removing non-essential modifiers and information using a Large Language Model (LLM).

Source code in src/ariadne/term_cleanup/term_cleaner.py

class TermCleaner:
    """
    Cleans clinical terms by removing non-essential modifiers and information using a Large Language Model (LLM).

    A term is only sent to the LLM when it matches the module-level trigger pattern. The cost reported with
    every LLM response is accumulated on the instance and can be read with ``get_total_cost``.
    """

    def __init__(self, config: "Config | None" = None):
        """
        Initializes the term cleaner.

        Args:
            config: Configuration providing ``term_cleaning.system_prompt``. When omitted, a new ``Config()``
                is created at call time.
        """

        # Build the default lazily: a ``Config()`` default in the signature is evaluated only once, when
        # the function is defined, and that single instance would be shared by every TermCleaner.
        if config is None:
            config = Config()
        self.system_prompt = config.term_cleaning.system_prompt
        self.cost = 0.0

    def clean_term(self, term: str) -> str:
        """
        Cleans a clinical term using an LLM to remove non-essential modifiers and information.

        Args:
            term: The clinical term to be cleaned.

        Returns:
            The cleaned clinical term. The input is returned unchanged when it is not a string, when it does
            not match the trigger pattern (no LLM call is made), or when the LLM response cannot be parsed
            (a warning is issued).
        """

        # Missing values (None, NaN) reach this method when it is applied to a DataFrame column;
        # the regex search below would raise TypeError on them, so pass them through untouched.
        if not isinstance(term, str):
            return term
        if re.search(_TRIGGER_PATTERN, term, flags=re.IGNORECASE) is None:
            return term
        prompt = f"#Term: {term}"
        response = get_llm_response(prompt=prompt, system_prompt=self.system_prompt)
        self.cost += response["usage"]["total_cost_usd"]
        # The answer is expected as a single "#Term: <cleaned term>" line, mirroring the prompt format.
        pattern = r"#Term: (.+)$"
        match = re.match(pattern, response["content"].strip())
        if match:
            return match.group(1)  # Returns the captured answer
        # Unparseable response: fall back to the original term rather than failing.
        warnings.warn(f"Term {term} not found in response {response}")
        return term

    def clean_terms(
        self, df: pd.DataFrame, term_column: str = "source_term", output_column: str = "cleaned_term"
    ) -> pd.DataFrame:
        """
        Cleans clinical terms in a DataFrame column using the LLM.

        Args:
            df: DataFrame containing the terms to be cleaned. It is modified in place.
            term_column: Name of the column with terms to be cleaned.
            output_column: Name of the column to store cleaned terms.

        Returns:
            The same DataFrame, with an additional column for cleaned terms.
        """

        df[output_column] = df[term_column].apply(self.clean_term)
        return df

    def get_total_cost(self) -> float:
        """
        Returns the total cost incurred for LLM calls during term cleaning.

        Returns:
            Total cost in USD.
        """

        return self.cost

`clean_term(term)`

Cleans a clinical term using an LLM to remove non-essential modifiers and information.

Parameters:

Name	Type	Description	Default
`term`	`str`	The clinical term to be cleaned.	required

Returns:

Type	Description
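`str`	The cleaned clinical term, or the original term unchanged when it does not match the trigger pattern or the LLM response cannot be parsed.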

Source code in src/ariadne/term_cleanup/term_cleaner.py

def clean_term(self, term: str) -> str:
    """
    Cleans a clinical term using an LLM to remove non-essential modifiers and information.

    Args:
        term: The clinical term to be cleaned.

    Returns:
        The cleaned clinical term. The input is returned unchanged when it is not a string, when it does
        not match the trigger pattern (no LLM call is made), or when the LLM response cannot be parsed
        (a warning is issued).
    """

    # Missing values (None, NaN) reach this method when it is applied to a DataFrame column;
    # the regex search below would raise TypeError on them, so pass them through untouched.
    if not isinstance(term, str):
        return term
    if re.search(_TRIGGER_PATTERN, term, flags=re.IGNORECASE) is None:
        return term
    prompt = f"#Term: {term}"
    response = get_llm_response(prompt=prompt, system_prompt=self.system_prompt)
    self.cost += response["usage"]["total_cost_usd"]
    # The answer is expected as a single "#Term: <cleaned term>" line, mirroring the prompt format.
    pattern = r"#Term: (.+)$"
    match = re.match(pattern, response["content"].strip())
    if match:
        return match.group(1)  # Returns the captured answer
    # Unparseable response: fall back to the original term rather than failing.
    warnings.warn(f"Term {term} not found in response {response}")
    return term

`clean_terms(df, term_column='source_term', output_column='cleaned_term')`

Cleans clinical terms in a DataFrame column using the LLM.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	DataFrame containing the terms to be cleaned.	required
`term_column`	`str`	Name of the column with terms to be cleaned.	`'source_term'`
`output_column`	`str`	Name of the column to store cleaned terms.	`'cleaned_term'`

Returns:

Type	Description
`DataFrame`	The input DataFrame, modified in place, with an additional column for cleaned terms.

Source code in src/ariadne/term_cleanup/term_cleaner.py

def clean_terms(
    self, df: pd.DataFrame, term_column: str = "source_term", output_column: str = "cleaned_term"
) -> pd.DataFrame:
    """
    Runs ``clean_term`` over every value of one DataFrame column and stores the results in another.

    Args:
        df: DataFrame holding the terms; the output column is written onto this same object.
        term_column: Column whose values are passed to ``clean_term``.
        output_column: Column that receives the cleaned values.

    Returns:
        The DataFrame that was passed in, now carrying the output column.
    """

    # Clean each value individually, then attach the resulting Series to the caller's frame.
    cleaned_values = df[term_column].apply(self.clean_term)
    df[output_column] = cleaned_values
    return df

`get_total_cost()`

Returns the total cost incurred for LLM calls during term cleaning.

Returns:

Type	Description
`float`	Total cost in USD.

Source code in src/ariadne/term_cleanup/term_cleaner.py

def get_total_cost(self) -> float:
    """
    Reports the LLM cost accumulated on this instance while cleaning terms.

    Returns:
        Accumulated cost in USD.
    """

    accumulated_cost = self.cost
    return accumulated_cost