Skip to content

term_cleaner

TermCleaner

A class to clean clinical terms by removing non-essential modifiers and information using a Large Language Model (LLM).

Source code in src/ariadne/term_cleanup/term_cleaner.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
class TermCleaner:
    """
    A class to clean clinical terms by removing non-essential modifiers and information using a Large Language Model (LLM).

    The cost reported by every LLM call made through this instance is summed in ``self.cost``.
    """

    # NOTE: the ``Config()`` default below is evaluated once, when the class body is executed,
    # so every instance created without an explicit ``config`` receives that same object.
    def __init__(self, config: Config = Config()):
        """
        Initializes the cleaner.

        Args:
            config: Configuration object; only ``config.term_cleaning.system_prompt`` is read here.
        """
        self.system_prompt = config.term_cleaning.system_prompt
        self.cost = 0.0  # running total of the per-call "total_cost_usd" values

    def clean_term(self, term: str) -> str:
        """
        Cleans a clinical term using an LLM to remove non-essential modifiers and information.

        The LLM is only called when the term matches ``_TRIGGER_PATTERN`` (case-insensitive);
        all other terms are returned unchanged and incur no cost. If the LLM reply does not
        have the form ``#Term: <cleaned term>``, a warning is issued and the original term is
        returned.

        Args:
            term: The clinical term to be cleaned. Non-string values (for example ``None`` or
                ``NaN`` coming from a DataFrame column) are returned unchanged.

        Returns:
            The cleaned clinical term, or the input itself when no cleaning was performed.
        """

        # Missing values would make re.search raise TypeError; pass them through untouched so
        # that one empty cell does not abort the cleaning of a whole column.
        if not isinstance(term, str):
            return term
        if re.search(_TRIGGER_PATTERN, term, flags=re.IGNORECASE) is None:
            return term
        prompt = f"#Term: {term}"
        response = get_llm_response(prompt=prompt, system_prompt=self.system_prompt)
        # The cost is accumulated before parsing, so unparseable replies are still counted.
        self.cost += response["usage"]["total_cost_usd"]
        # The whole (stripped) reply must be a single "#Term: ..." line for this to match.
        pattern = r"#Term: (.+)$"
        match = re.match(pattern, response["content"].strip())
        if match:
            return match.group(1)  # Returns the captured answer
        else:
            warnings.warn(f"Term {term} not found in response {response}")
            return term

    def clean_terms(
        self, df: pd.DataFrame, term_column: str = "source_term", output_column: str = "cleaned_term"
    ) -> pd.DataFrame:
        """
        Cleans clinical terms in a DataFrame column using the LLM.

        The output column is assigned directly on ``df``, so the DataFrame passed in is
        modified and that same object is returned.

        Args:
            df: DataFrame containing the terms to be cleaned.
            term_column: Name of the column with terms to be cleaned.
            output_column: Name of the column to store cleaned terms.

        Returns:
            DataFrame with an additional column for cleaned terms.
        """

        df[output_column] = df[term_column].apply(self.clean_term)
        return df

    def get_total_cost(self) -> float:
        """
        Returns the total cost incurred for LLM calls during term cleaning.

        Returns:
            Total cost in USD.
        """

        return self.cost

clean_term(term)

Cleans a clinical term using an LLM to remove non-essential modifiers and information.

Parameters:

Name Type Description Default
term str

The clinical term to be cleaned.

required

Returns:

Type Description
str

The cleaned clinical term.

Source code in src/ariadne/term_cleanup/term_cleaner.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def clean_term(self, term: str) -> str:
    """
    Cleans a clinical term using an LLM to remove non-essential modifiers and information.

    The LLM is only called when the term matches ``_TRIGGER_PATTERN`` (case-insensitive);
    all other terms are returned unchanged and incur no cost. If the LLM reply does not
    have the form ``#Term: <cleaned term>``, a warning is issued and the original term is
    returned.

    Args:
        term: The clinical term to be cleaned. Non-string values (for example ``None`` or
            ``NaN`` coming from a DataFrame column) are returned unchanged.

    Returns:
        The cleaned clinical term, or the input itself when no cleaning was performed.
    """

    # Missing values would make re.search raise TypeError; pass them through untouched so
    # that one empty cell does not abort the cleaning of a whole column.
    if not isinstance(term, str):
        return term
    if re.search(_TRIGGER_PATTERN, term, flags=re.IGNORECASE) is None:
        return term
    prompt = f"#Term: {term}"
    response = get_llm_response(prompt=prompt, system_prompt=self.system_prompt)
    # The cost is accumulated before parsing, so unparseable replies are still counted.
    self.cost += response["usage"]["total_cost_usd"]
    # The whole (stripped) reply must be a single "#Term: ..." line for this to match.
    pattern = r"#Term: (.+)$"
    match = re.match(pattern, response["content"].strip())
    if match:
        return match.group(1)  # Returns the captured answer
    else:
        warnings.warn(f"Term {term} not found in response {response}")
        return term

clean_terms(df, term_column='source_term', output_column='cleaned_term')

Cleans clinical terms in a DataFrame column using the LLM.

Parameters:

Name Type Description Default
df DataFrame

DataFrame containing the terms to be cleaned.

required
term_column str

Name of the column with terms to be cleaned.

'source_term'
output_column str

Name of the column to store cleaned terms.

'cleaned_term'

Returns:

Type Description
DataFrame

DataFrame with an additional column for cleaned terms.

Source code in src/ariadne/term_cleanup/term_cleaner.py
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def clean_terms(
    self, df: pd.DataFrame, term_column: str = "source_term", output_column: str = "cleaned_term"
) -> pd.DataFrame:
    """
    Runs ``clean_term`` over every value of one DataFrame column.

    The results are written to ``output_column`` on the DataFrame that was passed in,
    and that same DataFrame is returned.

    Args:
        df: DataFrame holding the terms to clean.
        term_column: Column to read the raw terms from.
        output_column: Column to write the cleaned terms to.

    Returns:
        The input DataFrame, now carrying the cleaned-term column.
    """

    raw_terms = df[term_column]
    cleaned_terms = raw_terms.apply(self.clean_term)
    df[output_column] = cleaned_terms
    return df

get_total_cost()

Returns the total cost incurred for LLM calls during term cleaning.

Returns:

Type Description
float

Total cost in USD.

Source code in src/ariadne/term_cleanup/term_cleaner.py
80
81
82
83
84
85
86
87
88
def get_total_cost(self) -> float:
    """
    Reports the LLM cost accumulated so far by this cleaner.

    Returns:
        The current value of ``self.cost`` (total cost in USD).
    """

    accumulated_cost = self.cost
    return accumulated_cost