vocab_verbatim_term_mapper

VocabVerbatimTermMapper

Maps source terms to concept IDs using a pre-built index of normalized terms. The index is created from vocabulary term files stored in Parquet format, downloaded using the download_terms module.

  1. If an index file exists at the verbatim_mapping_index_file path specified in the config, it is loaded.
  2. If not, the index is created by processing all Parquet files in the terms folder specified in the config.
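
A minimal usage sketch of that behavior, assuming the package is importable under the path shown in the source listing below; the example term and the concept values in the comment are illustrative only:

# Instantiating the mapper loads the pickled index if it exists, otherwise it
# builds the index from the Parquet files in the configured terms folder.
from ariadne.verbatim_mapping.vocab_verbatim_term_mapper import VocabVerbatimTermMapper

mapper = VocabVerbatimTermMapper()  # uses the default Config()

# Queries are normalized before the index lookup.
print(mapper.map_term("essential hypertension"))  # e.g. [(320128, "Essential hypertension")]
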
Source code in src/ariadne/verbatim_mapping/vocab_verbatim_term_mapper.py
class VocabVerbatimTermMapper:
    """
    Maps source terms to concept IDs using a pre-built index of normalized terms.
    The index is created from vocabulary term files stored in Parquet format, downloaded using the download_terms
    module.

    1. If an index file exists at the verbatim_mapping_index_file path specified in the config, it is loaded.
    2. If not, the index is created by processing all Parquet files in the terms folder specified in the config.
    """

    def __init__(self, config: Config = Config()):
        self.term_normalizer = TermNormalizer(config)
        if os.path.exists(config.system.verbatim_mapping_index_file):
            with open(config.system.verbatim_mapping_index_file, "rb") as handle:
                self.index = pickle.load(handle)
            print(f"Index loaded from {config.system.verbatim_mapping_index_file}")
        else:
            self._create_index(config)

    def _create_index(self, config: Config):
        print("Creating index")
        if not os.path.exists(config.system.terms_folder):
            raise FileNotFoundError(
                f"Terms folder {config.system.terms_folder} does not exist. Make sure to run the download_terms module first."
            )
        all_files = [
            os.path.join(config.system.terms_folder, f)
            for f in os.listdir(config.system.terms_folder)
            if f.endswith(".parquet")
        ]
        pool = multiprocessing.get_context("spawn").Pool(processes=config.system.max_cores)
        index_data = {}
        for file in all_files:
            print(f"Processing file: {file}")
            df = pd.read_parquet(file)
            normalized_terms = pool.map(self.term_normalizer.normalize_term, df["term"].tolist())
            for norm_term, concept_id, concept_name in zip(
                normalized_terms, df["concept_id"].tolist(), df["concept_name"].tolist()
            ):
                concept = (int(concept_id), concept_name)
                if norm_term in index_data:
                    existing = index_data[norm_term]
                    if isinstance(existing, list):
                        if concept_id not in [c[0] for c in existing]:
                            existing.append(concept)
                    else:
                        if concept_id != existing[0]:
                            index_data[norm_term] = [existing, concept]
                else:
                    index_data[norm_term] = concept

        pool.close()
        self.index = index_data

        try:
            with open(config.system.verbatim_mapping_index_file, "wb") as f:
                pickle.dump(index_data, f)
            print(f"Index saved to {config.system.verbatim_mapping_index_file}")
        except OSError as e:
            print(f"Error saving index: {e}")

    def map_term(self, source_term: str) -> List[tuple[int, str]]:
        """
        Maps a source term to concept IDs using the pre-built index.

        Args:
            source_term: the source clinical term to map

        Returns:
            A list of concept ID - concept name tuples, possibly empty if no match is found.
        """
        normalized_source = self.term_normalizer.normalize_term(source_term)
        if normalized_source in self.index:
            concepts = self.index[normalized_source]
            if isinstance(concepts, list):
                return concepts
            else:
                return [concepts]
        return []

    def map_terms(
        self,
        source_terms: pd.DataFrame,
        term_column: str = "cleaned_term",
        mapped_concept_id_column: str = "mapped_concept_id",
        mapped_concept_name_column: str = "mapped_concept_name",
    ) -> pd.DataFrame:
        """
        Maps source terms in a DataFrame column to concept IDs using the pre-built index.

        Args:
            source_terms: DataFrame containing the source clinical terms to map
            term_column: Name of the column with terms to map
            mapped_concept_id_column: Name of the column to store matched concept IDs.
            mapped_concept_name_column: Name of the column to store matched concept names.

        Returns:
            A DataFrame with the original columns and their mapped concept IDs and names.
        """
        source_terms[[mapped_concept_id_column, mapped_concept_name_column]] = source_terms[term_column].apply(
            lambda term: pd.Series(self.map_term(term)[0] if self.map_term(term) else (-1, ""))
        )
        return source_terms
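
The index maps each normalized term to a single (concept_id, concept_name) tuple and only switches to a list when a second, distinct concept ID shares the same normalized form. A standalone sketch of that accumulation rule, using hypothetical rows in place of the Parquet data:

# Hypothetical (normalized term, concept_id, concept_name) rows.
rows = [
    ("hypertension", 320128, "Essential hypertension"),
    ("hypertension", 320128, "Essential hypertension"),  # exact duplicate: ignored
    ("hypertension", 316866, "Hypertensive disorder"),   # second concept: promotes to a list
]

index = {}
for norm_term, concept_id, concept_name in rows:
    concept = (int(concept_id), concept_name)
    if norm_term not in index:
        index[norm_term] = concept                      # first hit: store a bare tuple
    else:
        existing = index[norm_term]
        if isinstance(existing, list):
            if concept_id not in [c[0] for c in existing]:
                existing.append(concept)                # new concept for an existing list
        elif concept_id != existing[0]:
            index[norm_term] = [existing, concept]      # promote tuple to list on collision

print(index["hypertension"])
# [(320128, 'Essential hypertension'), (316866, 'Hypertensive disorder')]
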

map_term(source_term)

Maps a source term to concept IDs using the pre-built index.

Parameters:

    source_term (str): the source clinical term to map. Required.

Returns:

    List[tuple[int, str]]: A list of concept ID - concept name tuples, possibly empty if no match is found.
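
A hedged usage example, assuming a mapper instance built as in the sketch above; the concept values in the comment are placeholders:

matches = mapper.map_term("Atrial fibrillation")
# A single match comes back as a one-element list, e.g. [(313217, "Atrial fibrillation")];
# several tuples are returned when distinct concepts share the normalized form,
# and [] when the normalized term is not in the index.
for concept_id, concept_name in matches:
    print(concept_id, concept_name)
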

Source code in src/ariadne/verbatim_mapping/vocab_verbatim_term_mapper.py
def map_term(self, source_term: str) -> List[tuple[int, str]]:
    """
    Maps a source term to concept IDs using the pre-built index.

    Args:
        source_term: the source clinical term to map

    Returns:
        A list of concept ID - concept name tuples, possibly empty if no match is found.
    """
    normalized_source = self.term_normalizer.normalize_term(source_term)
    if normalized_source in self.index:
        concepts = self.index[normalized_source]
        if isinstance(concepts, list):
            return concepts
        else:
            return [concepts]
    return []

map_terms(source_terms, term_column='cleaned_term', mapped_concept_id_column='mapped_concept_id', mapped_concept_name_column='mapped_concept_name')

Maps source terms in a DataFrame column to concept IDs using the pre-built index.

Parameters:

    source_terms (DataFrame): DataFrame containing the source clinical terms to map. Required.
    term_column (str): Name of the column with terms to map. Default: 'cleaned_term'.
    mapped_concept_id_column (str): Name of the column to store matched concept IDs. Default: 'mapped_concept_id'.
    mapped_concept_name_column (str): Name of the column to store matched concept names. Default: 'mapped_concept_name'.

Returns:

    DataFrame: A DataFrame with the original columns and their mapped concept IDs and names.
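
A short usage sketch, again assuming the mapper instance from above; the input terms are illustrative:

import pandas as pd

terms = pd.DataFrame({"cleaned_term": ["atrial fibrillation", "not a real term"]})
mapped = mapper.map_terms(terms)

# Only the first match per term is kept; unmatched terms get -1 and an empty name.
print(mapped[["cleaned_term", "mapped_concept_id", "mapped_concept_name"]])
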

Source code in src/ariadne/verbatim_mapping/vocab_verbatim_term_mapper.py
def map_terms(
    self,
    source_terms: pd.DataFrame,
    term_column: str = "cleaned_term",
    mapped_concept_id_column: str = "mapped_concept_id",
    mapped_concept_name_column: str = "mapped_concept_name",
) -> pd.DataFrame:
    """
    Maps source terms in a DataFrame column to concept IDs using the pre-built index.

    Args:
        source_terms: DataFrame containing the source clinical terms to map
        term_column: Name of the column with terms to map
        mapped_concept_id_column: Name of the column to store matched concept IDs.
        mapped_concept_name_column: Name of the column to store matched concept names.

    Returns:
        A DataFrame with the original columns and their mapped concept IDs and names.
    """
    source_terms[[mapped_concept_id_column, mapped_concept_name_column]] = source_terms[term_column].apply(
        lambda term: pd.Series(self.map_term(term)[0] if self.map_term(term) else (-1, ""))
    )
    return source_terms