Skip to content

hecate_concept_searcher

HecateConceptSearcher

Bases: AbstractConceptSearcher

A concept searcher that uses the OHDSI Hecate API to find concepts based on query strings.

Source code in src/ariadne/vector_search/hecate_concept_searcher.py
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
class HecateConceptSearcher(AbstractConceptSearcher):
    """
    A concept searcher that uses the OHDSI Hecate API to find concepts based on query strings.

    A fixed set of filter parameters (``default_params``) is chosen at construction time and merged into
    every search request.
    """

    def __init__(self, for_evaluation: bool = False):
        """
        Initializes the HecateConceptSearcher.

        Args:
            for_evaluation: If True, configures the searcher for evaluation purposes: in addition to the
                standard-concept filter, requests are restricted by domain and concept class, and a list of
                vocabularies is excluded.
        """
        self.for_evaluation = for_evaluation

        if for_evaluation:
            print("HecateConceptSearcher initialized in evaluation mode.")
            self.default_params = {
                "standard_concept": "S",
                "domain_id": "Condition,Observation,Measurement,Procedure",
                "concept_class_id": "3-dig billing code,3-dig nonbill code,4-dig billing code,Answer,Claims Attachment,Clinical Finding,Clinical Observation,Context-dependent,CPT4,CPT4 Modifier,Disorder,Event,Genetic Variation,HCPCS,Histopattern,ICD10PCS,ICD10PCS Hierarchy,ICDO Condition,ICDO Histology,Ingredient,Lab Test,MDC,Metastasis,MS-DRG,NAACCR Variable,Observable Entity,Procedure,Question,Social Context,Staging / Scales,Staging/Grading,Survey,Topic,Topography,Value,Variable",
                "exclude_vocabulary_id": "ICD9CM,ICD10CM,ICD10,ICD10CN,ICD10GM,CIM10,ICDO3,KCD7,Read",
            }
        else:
            print("HecateConceptSearcher initialized in standard mode.")
            self.default_params = {
                "standard_concept": "S",
            }

    def search_term(self, query_string: str, limit: int = 25) -> DataFrame:
        """
        Searches for concepts matching the given query string.

        Args:
            query_string: The term to search for.
            limit: The maximum number of results to return.

        Returns:
            A DataFrame containing the matching concepts, with the same columns as the concept table in the OMOP CDM,
            plus a 'score' column indicating the relevance score from the search. If the request fails (HTTP error
            status, connection problem, timeout or any other ``requests`` error), the error is printed and None is
            returned instead.

        """

        # default_params is applied last, so its entries win over "q"/"limit" on a key clash.
        params = {"q": query_string, "limit": limit}
        params.update(self.default_params)

        try:
            response = requests.get(_HECATE_URL, params=params, timeout=15)
            response.raise_for_status()
            concepts = []
            for term in response.json():
                term_concepts = term.get("concepts", [])
                # The score is reported once per matched term; copy it onto each of that term's concepts.
                for concept in term_concepts:
                    concept["score"] = term.get("score", None)
                concepts.extend(term_concepts)
            return pd.DataFrame(concepts)

        # Handler order matters: the first matching clause wins, so the specific classes come before
        # the generic RequestException.
        except requests.exceptions.HTTPError as http_err:
            # `response` is always bound here because only raise_for_status() raises HTTPError above.
            print(f"HTTP error occurred: {http_err}")
            print(f"Response status code: {response.status_code}")
            print(f"Response content: {response.text}")
        except requests.exceptions.ConnectionError as conn_err:
            print(f"Connection error occurred: {conn_err}")
        except requests.exceptions.Timeout as timeout_err:
            print(f"The request timed out: {timeout_err}")
        except requests.exceptions.RequestException as err:
            print(f"An unexpected error occurred: {err}")

        return None

    def search_terms(
        self,
        df: pd.DataFrame,
        term_column: str,
        matched_concept_id_column: str = "matched_concept_id",
        matched_concept_name_column: str = "matched_concept_name",
        match_score_column: str = "match_score",
        match_rank_column: str = "match_rank",
        limit: int = 25,
    ) -> pd.DataFrame:
        """
        Searches the Hecate API for concepts matching terms in a DataFrame column.

        Args:
            df: DataFrame containing the terms to search for.
            term_column: Name of the column with terms to search.
            matched_concept_id_column: Name of the column to store matched concept IDs.
            matched_concept_name_column: Name of the column to store matched concept names.
            match_score_column: Name of the column to store match scores.
            match_rank_column: Name of the column to store match ranks.
            limit: The maximum number of results to return for each term.

        Returns:
            A DataFrame containing the same columns as the input dataframe plus the matching concepts for each term. For
            each term in the input dataframe, multiple rows will be returned corresponding to each matching concept.
            Ranks start at 1 and follow the order in which the search returned the concepts. Terms whose search
            failed or returned no concepts contribute no rows; if no term produced a match (or the input is empty),
            an empty DataFrame with the expected columns is returned.

        """

        match_columns = [
            matched_concept_id_column,
            matched_concept_name_column,
            match_score_column,
            match_rank_column,
        ]
        output_columns = list(df.columns) + match_columns

        frames = []
        for _, row in df.iterrows():
            term = row[term_column]
            print(f"Processing term '{term}'")
            found = self.search_term(term, limit=limit)
            # None signals a failed request; an empty frame means the search matched nothing.
            if found is None or found.empty:
                continue

            source_values = row.to_dict()
            records = [
                {
                    **source_values,
                    matched_concept_id_column: concept_id,
                    matched_concept_name_column: concept_name,
                    match_score_column: score,
                    match_rank_column: rank,
                }
                for rank, (concept_id, concept_name, score) in enumerate(
                    zip(found["concept_id"], found["concept_name"], found["score"]), start=1
                )
            ]
            frames.append(pd.DataFrame(records, columns=output_columns))

        # pd.concat raises ValueError on an empty list, so handle "nothing matched" explicitly.
        if not frames:
            return pd.DataFrame(columns=output_columns)
        return pd.concat(frames)

__init__(for_evaluation=False)

Initializes the HecateConceptSearcher.

Parameters:

Name Type Description Default
for_evaluation bool

If True, configures the searcher for evaluation purposes.

False
Source code in src/ariadne/vector_search/hecate_concept_searcher.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def __init__(self, for_evaluation: bool = False):
    """
    Set up the searcher and pick the filter parameters sent with every search request.

    Args:
        for_evaluation: When True, requests are additionally restricted by domain and concept class, and a
            list of vocabularies is excluded. When False, only the standard-concept filter is applied.
    """
    self.for_evaluation = for_evaluation

    # Standard mode is the simple case: handle it first and leave.
    if not for_evaluation:
        print("HecateConceptSearcher initialized in standard mode.")
        self.default_params = {"standard_concept": "S"}
        return

    print("HecateConceptSearcher initialized in evaluation mode.")
    evaluation_filters = {
        "standard_concept": "S",
        "domain_id": "Condition,Observation,Measurement,Procedure",
        "concept_class_id": "3-dig billing code,3-dig nonbill code,4-dig billing code,Answer,Claims Attachment,Clinical Finding,Clinical Observation,Context-dependent,CPT4,CPT4 Modifier,Disorder,Event,Genetic Variation,HCPCS,Histopattern,ICD10PCS,ICD10PCS Hierarchy,ICDO Condition,ICDO Histology,Ingredient,Lab Test,MDC,Metastasis,MS-DRG,NAACCR Variable,Observable Entity,Procedure,Question,Social Context,Staging / Scales,Staging/Grading,Survey,Topic,Topography,Value,Variable",
        "exclude_vocabulary_id": "ICD9CM,ICD10CM,ICD10,ICD10CN,ICD10GM,CIM10,ICDO3,KCD7,Read",
    }
    self.default_params = evaluation_filters

search_term(query_string, limit=25)

Searches for concepts matching the given query string.

Parameters:

Name Type Description Default
query_string str

The term to search for.

required
limit int

The maximum number of results to return.

25

Returns:

Type Description
DataFrame

A DataFrame containing the matching concepts, with the same columns as the concept table in the OMOP CDM, plus a 'score' column indicating the relevance score from the search. If the request fails, the error is printed and None is returned instead.

Source code in src/ariadne/vector_search/hecate_concept_searcher.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def search_term(self, query_string: str, limit: int = 25) -> DataFrame:
    """
    Query the Hecate API for concepts that match a single term.

    Args:
        query_string: The term to search for.
        limit: The maximum number of results to return.

    Returns:
        A DataFrame with one row per matching concept, carrying the fields returned by the API plus a
        'score' column holding the relevance score of the matched term. When the request fails, the
        error is printed and None is returned.

    """

    # The instance-wide filters are merged last, exactly as a dict.update() would do.
    request_params = {"q": query_string, "limit": limit, **self.default_params}

    try:
        response = requests.get(_HECATE_URL, params=request_params, timeout=15)
        response.raise_for_status()
        records = []
        for hit in response.json():
            matched = hit.get("concepts", [])
            # Stamp each concept with the score of the term it was matched through.
            for entry in matched:
                entry["score"] = hit.get("score", None)
            records.extend(matched)
        return pd.DataFrame(records)

    except requests.exceptions.HTTPError as failure:
        print(f"HTTP error occurred: {failure}")
        print(f"Response status code: {response.status_code}")
        print(f"Response content: {response.text}")
    except requests.exceptions.ConnectionError as failure:
        print(f"Connection error occurred: {failure}")
    except requests.exceptions.Timeout as failure:
        print(f"The request timed out: {failure}")
    except requests.exceptions.RequestException as failure:
        print(f"An unexpected error occurred: {failure}")

    # Reached only after one of the handlers above has reported a failure.
    return None

search_terms(df, term_column, matched_concept_id_column='matched_concept_id', matched_concept_name_column='matched_concept_name', match_score_column='match_score', match_rank_column='match_rank', limit=25)

Searches the Hecate API for concepts matching terms in a DataFrame column.

Parameters:

Name Type Description Default
df DataFrame

DataFrame containing the terms to search for.

required
term_column str

Name of the column with terms to search.

required
matched_concept_id_column str

Name of the column to store matched concept IDs.

'matched_concept_id'
matched_concept_name_column str

Name of the column to store matched concept names.

'matched_concept_name'
match_score_column str

Name of the column to store match scores.

'match_score'
match_rank_column str

Name of the column to store match ranks.

'match_rank'
limit int

The maximum number of results to return for each term.

25

Returns:

Type Description
DataFrame

A DataFrame containing the same columns as the input dataframe plus the matching concepts for each term. For each term in the input dataframe, multiple rows will be returned corresponding to each matching concept.

Source code in src/ariadne/vector_search/hecate_concept_searcher.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
def search_terms(
    self,
    df: pd.DataFrame,
    term_column: str,
    matched_concept_id_column: str = "matched_concept_id",
    matched_concept_name_column: str = "matched_concept_name",
    match_score_column: str = "match_score",
    match_rank_column: str = "match_rank",
    limit: int = 25,
) -> pd.DataFrame:
    """
    Searches the Hecate API for concepts matching terms in a DataFrame column.

    Args:
        df: DataFrame containing the terms to search for.
        term_column: Name of the column with terms to search.
        matched_concept_id_column: Name of the column to store matched concept IDs.
        matched_concept_name_column: Name of the column to store matched concept names.
        match_score_column: Name of the column to store match scores.
        match_rank_column: Name of the column to store match ranks.
        limit: The maximum number of results to return for each term.

    Returns:
        A DataFrame containing the same columns as the input dataframe plus the matching concepts for each term. For
        each term in the input dataframe, multiple rows will be returned corresponding to each matching concept.
        Ranks start at 1 and follow the order in which the search returned the concepts. Terms whose search
        failed or returned no concepts contribute no rows; if no term produced a match (or the input is empty),
        an empty DataFrame with the expected columns is returned.

    """

    match_columns = [
        matched_concept_id_column,
        matched_concept_name_column,
        match_score_column,
        match_rank_column,
    ]
    output_columns = list(df.columns) + match_columns

    frames = []
    for _, row in df.iterrows():
        term = row[term_column]
        print(f"Processing term '{term}'")
        found = self.search_term(term, limit=limit)
        # None signals a failed request; an empty frame means the search matched nothing.
        if found is None or found.empty:
            continue

        source_values = row.to_dict()
        records = [
            {
                **source_values,
                matched_concept_id_column: concept_id,
                matched_concept_name_column: concept_name,
                match_score_column: score,
                match_rank_column: rank,
            }
            for rank, (concept_id, concept_name, score) in enumerate(
                zip(found["concept_id"], found["concept_name"], found["score"]), start=1
            )
        ]
        frames.append(pd.DataFrame(records, columns=output_columns))

    # pd.concat raises ValueError on an empty list, so handle "nothing matched" explicitly.
    if not frames:
        return pd.DataFrame(columns=output_columns)
    return pd.concat(frames)