hecate_concept_searcher

`HecateConceptSearcher`

Bases: AbstractConceptSearcher

A concept searcher that uses the OHDSI Hecate API to find concepts based on query strings.

Source code in src/ariadne/vector_search/hecate_concept_searcher.py

class HecateConceptSearcher(AbstractConceptSearcher):
    """
    A concept searcher that uses the OHDSI Hecate API to find concepts based on query strings.

    Every request is sent with the query parameters stored in `default_params`, which are chosen once at
    construction time depending on the `for_evaluation` flag.
    """

    def __init__(self, for_evaluation: bool = False):
        """
        Initializes the HecateConceptSearcher.

        Prints a one-line message stating which mode was selected.

        Args:
            for_evaluation: If True, configures the searcher for evaluation purposes: in addition to restricting
                results to standard concepts, searches are filtered by a fixed set of domains and concept classes
                and a fixed set of vocabularies is excluded. If False, only the standard-concept filter is applied.
        """
        self.for_evaluation = for_evaluation

        if for_evaluation:
            print("HecateConceptSearcher initialized in evaluation mode.")
            self.default_params = {
                "standard_concept": "S",
                "domain_id": "Condition,Observation,Measurement,Procedure",
                "concept_class_id": "3-dig billing code,3-dig nonbill code,4-dig billing code,Answer,Claims Attachment,Clinical Finding,Clinical Observation,Context-dependent,CPT4,CPT4 Modifier,Disorder,Event,Genetic Variation,HCPCS,Histopattern,ICD10PCS,ICD10PCS Hierarchy,ICDO Condition,ICDO Histology,Ingredient,Lab Test,MDC,Metastasis,MS-DRG,NAACCR Variable,Observable Entity,Procedure,Question,Social Context,Staging / Scales,Staging/Grading,Survey,Topic,Topography,Value,Variable",
                "exclude_vocabulary_id": "ICD9CM,ICD10CM,ICD10,ICD10CN,ICD10GM,CIM10,ICDO3,KCD7,Read",
            }
        else:
            print("HecateConceptSearcher initialized in standard mode.")
            self.default_params = {
                "standard_concept": "S",
            }

    def search_term(self, query_string: str, limit: int = 25) -> DataFrame:
        """
        Searches for concepts matching the given query string.

        Args:
            query_string: The term to search for.
            limit: The maximum number of results to return (sent to the API as the `limit` query parameter).

        Returns:
            A DataFrame containing the matching concepts, with the same columns as the concept table in the OMOP CDM,
            plus a 'score' column indicating the relevance score from the search. If the request fails (HTTP error
            status, connection error, timeout or any other requests-level error) the problem is printed and None is
            returned instead.

        """

        params = {"q": query_string, "limit": limit}
        params.update(self.default_params)

        try:
            response = requests.get(_HECATE_URL, params=params, timeout=15)
            response.raise_for_status()
            terms = response.json()
            concepts = []
            for term in terms:
                term_concepts = term.get("concepts", [])
                # The score is reported per search hit; copy it onto each concept of that hit.
                for concept in term_concepts:
                    concept["score"] = term.get("score", None)
                concepts.extend(term_concepts)
            return pd.DataFrame(concepts)

        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred: {http_err}")
            print(f"Response status code: {response.status_code}")
            print(f"Response content: {response.text}")
        except requests.exceptions.ConnectionError as conn_err:
            print(f"Connection error occurred: {conn_err}")
        except requests.exceptions.Timeout as timeout_err:
            print(f"The request timed out: {timeout_err}")
        except requests.exceptions.RequestException as err:
            print(f"An unexpected error occurred: {err}")

        # Reached only when one of the handlers above ran.
        return None

    def search_terms(
        self,
        df: pd.DataFrame,
        term_column: str,
        matched_concept_id_column: str = "matched_concept_id",
        matched_concept_name_column: str = "matched_concept_name",
        match_score_column: str = "match_score",
        match_rank_column: str = "match_rank",
        limit: int = 25,
    ) -> pd.DataFrame:
        """
        Searches the Hecate API for concepts matching terms in a DataFrame column.

        Each row of `df` triggers one call to `search_term`; a progress line is printed per term.

        Args:
            df: DataFrame containing the terms to search for.
            term_column: Name of the column with terms to search.
            matched_concept_id_column: Name of the column to store matched concept IDs.
            matched_concept_name_column: Name of the column to store matched concept names.
            match_score_column: Name of the column to store match scores.
            match_rank_column: Name of the column to store match ranks (1 for the first concept returned).
            limit: The maximum number of results to return for each term.

        Returns:
            A DataFrame containing the same columns as the input dataframe plus the matching concepts for each term. For
            each term in the input dataframe, multiple rows will be returned corresponding to each matching concept.
            Terms whose search failed or returned no concepts contribute no rows. If nothing matched at all (including
            an empty input dataframe), an empty DataFrame with the input columns followed by the four match columns is
            returned. The index is not reset across terms.

        """

        orig_cols = list(df.columns)
        match_cols = [
            matched_concept_id_column,
            matched_concept_name_column,
            match_score_column,
            match_rank_column,
        ]

        all_results = []
        for _, row in df.iterrows():
            term = row[term_column]
            print(f"Processing term '{term}'")
            results = self.search_term(term, limit=limit)
            # None signals a failed request; an empty frame means the API found nothing.
            if results is None or results.empty:
                continue

            matches = (
                results[["concept_id", "concept_name", "score"]]
                .rename(
                    columns={
                        "concept_id": matched_concept_id_column,
                        "concept_name": matched_concept_name_column,
                        "score": match_score_column,
                    }
                )
                .reset_index(drop=True)
            )
            matches[match_rank_column] = range(1, len(matches) + 1)

            # Repeat the values of the input row on every match row.
            for col in orig_cols:
                matches[col] = row[col]
            all_results.append(matches[orig_cols + match_cols])

        # pd.concat raises ValueError on an empty list, so handle "no matches" explicitly.
        if not all_results:
            return pd.DataFrame(columns=orig_cols + match_cols)
        return pd.concat(all_results)

`__init__(for_evaluation=False)`

Initializes the HecateConceptSearcher.

Parameters:

Name	Type	Description	Default
`for_evaluation`	`bool`	If True, configures the searcher for evaluation purposes.	`False`

Source code in src/ariadne/vector_search/hecate_concept_searcher.py

def __init__(self, for_evaluation: bool = False):
    """
    Sets up the searcher and picks the query parameters sent with every request.

    Args:
        for_evaluation: When truthy, the default parameters additionally filter on a fixed set of domains and
            concept classes and exclude a fixed set of vocabularies. Otherwise only the standard-concept filter
            is used. A message naming the selected mode is printed either way.
    """
    self.for_evaluation = for_evaluation

    # Both modes restrict results to standard concepts.
    query_defaults = {"standard_concept": "S"}

    if not for_evaluation:
        print("HecateConceptSearcher initialized in standard mode.")
    else:
        print("HecateConceptSearcher initialized in evaluation mode.")
        query_defaults["domain_id"] = "Condition,Observation,Measurement,Procedure"
        query_defaults["concept_class_id"] = "3-dig billing code,3-dig nonbill code,4-dig billing code,Answer,Claims Attachment,Clinical Finding,Clinical Observation,Context-dependent,CPT4,CPT4 Modifier,Disorder,Event,Genetic Variation,HCPCS,Histopattern,ICD10PCS,ICD10PCS Hierarchy,ICDO Condition,ICDO Histology,Ingredient,Lab Test,MDC,Metastasis,MS-DRG,NAACCR Variable,Observable Entity,Procedure,Question,Social Context,Staging / Scales,Staging/Grading,Survey,Topic,Topography,Value,Variable"
        query_defaults["exclude_vocabulary_id"] = "ICD9CM,ICD10CM,ICD10,ICD10CN,ICD10GM,CIM10,ICDO3,KCD7,Read"

    self.default_params = query_defaults

`search_term(query_string, limit=25)`

Searches for concepts matching the given query string.

Parameters:

Name	Type	Description	Default
`query_string`	`str`	The term to search for.	required
`limit`	`int`	The maximum number of results to return.	`25`

Returns:

Type	Description
`DataFrame`	A DataFrame containing the matching concepts, with the same columns as the concept table in the OMOP CDM, plus a 'score' column indicating the relevance score from the search.

Source code in src/ariadne/vector_search/hecate_concept_searcher.py

def search_term(self, query_string: str, limit: int = 25) -> DataFrame:
    """
    Queries the Hecate API for concepts that match a single query string.

    Args:
        query_string: The term to search for.
        limit: The maximum number of results to return (sent to the API as the `limit` query parameter).

    Returns:
        A DataFrame containing the matching concepts, with the same columns as the concept table in the OMOP CDM,
        plus a 'score' column holding the relevance score of the search hit each concept came from. When the
        request fails (HTTP error status, connection error, timeout or another requests-level error), the error
        is printed and None is returned.

    """

    # Instance-level defaults are applied last, exactly as a dict.update would.
    request_params = {"q": query_string, "limit": limit, **self.default_params}

    try:
        response = requests.get(_HECATE_URL, params=request_params, timeout=15)
        response.raise_for_status()
        hits = response.json()
        collected = []
        for hit in hits:
            hit_concepts = hit.get("concepts", [])
            # Stamp every concept with the score of the hit it belongs to.
            for concept in hit_concepts:
                concept["score"] = hit.get("score", None)
            collected.extend(hit_concepts)
        return pd.DataFrame(collected)

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        print(f"Response status code: {response.status_code}")
        print(f"Response content: {response.text}")
    except requests.exceptions.ConnectionError as conn_err:
        print(f"Connection error occurred: {conn_err}")
    except requests.exceptions.Timeout as timeout_err:
        print(f"The request timed out: {timeout_err}")
    except requests.exceptions.RequestException as err:
        print(f"An unexpected error occurred: {err}")

    # Only reachable after one of the handlers above has reported a failure.
    return None

`search_terms(df, term_column, matched_concept_id_column='matched_concept_id', matched_concept_name_column='matched_concept_name', match_score_column='match_score', match_rank_column='match_rank', limit=25)`

Searches the Hecate API for concepts matching terms in a DataFrame column.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	DataFrame containing the terms to search for.	required
`term_column`	`str`	Name of the column with terms to search.	required
`matched_concept_id_column`	`str`	Name of the column to store matched concept IDs.	`'matched_concept_id'`
`matched_concept_name_column`	`str`	Name of the column to store matched concept names.	`'matched_concept_name'`
`match_score_column`	`str`	Name of the column to store match scores.	`'match_score'`
`match_rank_column`	`str`	Name of the column to store match ranks.	`'match_rank'`
`limit`	`int`	The maximum number of results to return for each term.	`25`

Returns:

Type	Description
`DataFrame`	A DataFrame containing the same columns as the input dataframe plus the matching concepts for each term. For each term in the input dataframe, multiple rows will be returned corresponding to each matching concept.

Source code in src/ariadne/vector_search/hecate_concept_searcher.py

def search_terms(
    self,
    df: pd.DataFrame,
    term_column: str,
    matched_concept_id_column: str = "matched_concept_id",
    matched_concept_name_column: str = "matched_concept_name",
    match_score_column: str = "match_score",
    match_rank_column: str = "match_rank",
    limit: int = 25,
) -> pd.DataFrame:
    """
    Searches the Hecate API for concepts matching terms in a DataFrame column.

    Each row of `df` triggers one call to `self.search_term`; a progress line is printed per term.

    Args:
        df: DataFrame containing the terms to search for.
        term_column: Name of the column with terms to search.
        matched_concept_id_column: Name of the column to store matched concept IDs.
        matched_concept_name_column: Name of the column to store matched concept names.
        match_score_column: Name of the column to store match scores.
        match_rank_column: Name of the column to store match ranks (1 for the first concept returned).
        limit: The maximum number of results to return for each term.

    Returns:
        A DataFrame containing the same columns as the input dataframe plus the matching concepts for each term. For
        each term in the input dataframe, multiple rows will be returned corresponding to each matching concept.
        Terms whose search failed or returned no concepts contribute no rows. If nothing matched at all (including
        an empty input dataframe), an empty DataFrame with the input columns followed by the four match columns is
        returned. The index is not reset across terms.

    """

    orig_cols = list(df.columns)
    match_cols = [
        matched_concept_id_column,
        matched_concept_name_column,
        match_score_column,
        match_rank_column,
    ]

    all_results = []
    for _, row in df.iterrows():
        term = row[term_column]
        print(f"Processing term '{term}'")
        results = self.search_term(term, limit=limit)
        # None signals a failed request; an empty frame means the API found nothing.
        if results is None or results.empty:
            continue

        matches = (
            results[["concept_id", "concept_name", "score"]]
            .rename(
                columns={
                    "concept_id": matched_concept_id_column,
                    "concept_name": matched_concept_name_column,
                    "score": match_score_column,
                }
            )
            .reset_index(drop=True)
        )
        matches[match_rank_column] = range(1, len(matches) + 1)

        # Repeat the values of the input row on every match row.
        for col in orig_cols:
            matches[col] = row[col]
        all_results.append(matches[orig_cols + match_cols])

    # pd.concat raises ValueError on an empty list, so handle "no matches" explicitly.
    if not all_results:
        return pd.DataFrame(columns=orig_cols + match_cols)
    return pd.concat(all_results)

hecate_concept_searcher