vocab_verbatim_term_mapper

VocabVerbatimTermMapper

Maps source terms to concept IDs using a pre-built index of normalized terms. The index is created from vocabulary term files stored in Parquet format, downloaded using the download_terms module.

  1. If an index file exists at the verbatim_mapping_index_file path specified in the config, it is loaded.
  2. If not, the index is created by processing all Parquet files in the terms folder specified in the config.
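
A minimal usage sketch of that behavior, assuming the package is importable under the path shown in the source listing below; the example term and the concept values in the comment are illustrative only:

# Instantiating the mapper loads the pickled index if it exists, otherwise it
# builds the index from the Parquet files in the configured terms folder.
from ariadne.verbatim_mapping.vocab_verbatim_term_mapper import VocabVerbatimTermMapper

mapper = VocabVerbatimTermMapper()  # uses the default Config()

# Queries are normalized before the index lookup.
print(mapper.map_term("essential hypertension"))  # e.g. [(320128, "Essential hypertension")]
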
Source code in src/ariadne/verbatim_mapping/vocab_verbatim_term_mapper.py
class VocabVerbatimTermMapper:
    """
    Maps source terms to concept IDs using a pre-built index of normalized terms.
    The index is created from vocabulary term files stored in Parquet format, downloaded using the download_terms
    module.

    1. If an index file exists at the verbatim_mapping_index_file path specified in the config, it is loaded.
    2. If not, the index is created by processing all Parquet files in the terms folder specified in the config.
    """

    def __init__(self, config: Config = Config()):
        self.term_normalizer = TermNormalizer(config)
        if os.path.exists(config.system.verbatim_mapping_index_file):
            with open(config.system.verbatim_mapping_index_file, "rb") as handle:
                self.index = pickle.load(handle)
            print(f"Index loaded from {config.system.verbatim_mapping_index_file}")
        else:
            self._create_index(config)

    def _create_index(self, config: Config):
        print("Creating index")
        if not os.path.exists(config.system.terms_folder):
            raise FileNotFoundError(
                f"Terms folder {config.system.terms_folder} does not exist. Make sure to run the download_terms module first."
            )
        all_files = [
            os.path.join(config.system.terms_folder, f)
            for f in os.listdir(config.system.terms_folder)
            if f.endswith(".parquet")
        ]
        pool = multiprocessing.get_context("spawn").Pool(processes=config.system.max_cores)
        index_data = {}
        for file in all_files:
            print(f"Processing file: {file}")
            df = pd.read_parquet(file)
            normalized_terms = pool.map(self.term_normalizer.normalize_term, df["term"].tolist())
            for norm_term, concept_id, concept_name in zip(
                normalized_terms, df["concept_id"].tolist(), df["concept_name"].tolist()
            ):
                concept = (int(concept_id), concept_name)
                if norm_term in index_data:
                    existing = index_data[norm_term]
                    if isinstance(existing, list):
                        if concept_id not in [c[0] for c in existing]:
                            existing.append(concept)
                    else:
                        if concept_id != existing[0]:
                            index_data[norm_term] = [existing, concept]
                else:
                    index_data[norm_term] = concept

        pool.close()
        self.index = index_data

        try:
            with open(config.system.verbatim_mapping_index_file, "wb") as f:
                pickle.dump(index_data, f)
            print(f"Index saved to {config.system.verbatim_mapping_index_file}")
        except OSError as e:
            print(f"Error saving index: {e}")

    def map_term(self, source_term: str) -> List[tuple[int, str]]:
        """
        Maps a source term to concept IDs using the pre-built index.

        Args:
            source_term: the source clinical term to map

        Returns:
            A list of concept ID - concept name tuples, possibly empty if no match is found.
        """
        normalized_source = self.term_normalizer.normalize_term(source_term)
        if normalized_source in self.index:
            concepts = self.index[normalized_source]
            if isinstance(concepts, list):
                return concepts
            else:
                return [concepts]
        return []

    def map_terms(
        self,
        source_terms: pd.DataFrame,
        term_column: str = "cleaned_term",
        mapped_concept_id_column: str = "mapped_concept_id",
        mapped_concept_name_column: str = "mapped_concept_name",
    ) -> pd.DataFrame:
        """
        Maps source terms in a DataFrame column to concept IDs using the pre-built index.

        Args:
            source_terms: DataFrame containing the source clinical terms to map
            term_column: Name of the column with terms to map
            mapped_concept_id_column: Name of the column to store matched concept IDs.
            mapped_concept_name_column: Name of the column to store matched concept names.

        Returns:
            A DataFrame with the original columns and their mapped concept IDs and names.
        """
        source_terms[[mapped_concept_id_column, mapped_concept_name_column]] = source_terms[term_column].apply(
            lambda term: pd.Series(self.map_term(term)[0] if self.map_term(term) else (-1, ""))
        )
        return source_terms
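
The index maps each normalized term to a single (concept_id, concept_name) tuple and only switches to a list when a second, distinct concept ID shares the same normalized form. A standalone sketch of that accumulation rule, using hypothetical rows in place of the Parquet data:

# Hypothetical (normalized term, concept_id, concept_name) rows.
rows = [
    ("hypertension", 320128, "Essential hypertension"),
    ("hypertension", 320128, "Essential hypertension"),  # exact duplicate: ignored
    ("hypertension", 316866, "Hypertensive disorder"),   # second concept: promotes to a list
]

index = {}
for norm_term, concept_id, concept_name in rows:
    concept = (int(concept_id), concept_name)
    if norm_term not in index:
        index[norm_term] = concept                      # first hit: store a bare tuple
    else:
        existing = index[norm_term]
        if isinstance(existing, list):
            if concept_id not in [c[0] for c in existing]:
                existing.append(concept)                # new concept for an existing list
        elif concept_id != existing[0]:
            index[norm_term] = [existing, concept]      # promote tuple to list on collision

print(index["hypertension"])
# [(320128, 'Essential hypertension'), (316866, 'Hypertensive disorder')]
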

map_term(source_term)

Maps a source term to concept IDs using the pre-built index.

Parameters:

    source_term (str): the source clinical term to map. Required.

Returns:

    List[tuple[int, str]]: A list of concept ID - concept name tuples, possibly empty if no match is found.
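
A hedged usage example, assuming a mapper instance built as in the sketch above; the concept values in the comment are placeholders:

matches = mapper.map_term("Atrial fibrillation")
# A single match comes back as a one-element list, e.g. [(313217, "Atrial fibrillation")];
# several tuples are returned when distinct concepts share the normalized form,
# and [] when the normalized term is not in the index.
for concept_id, concept_name in matches:
    print(concept_id, concept_name)
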

Source code in src/ariadne/verbatim_mapping/vocab_verbatim_term_mapper.py
def map_term(self, source_term: str) -> List[tuple[int, str]]:
    """
    Maps a source term to concept IDs using the pre-built index.

    Args:
        source_term: the source clinical term to map

    Returns:
        A list of concept ID - concept name tuples, possibly empty if no match is found.
    """
    normalized_source = self.term_normalizer.normalize_term(source_term)
    if normalized_source in self.index:
        concepts = self.index[normalized_source]
        if isinstance(concepts, list):
            return concepts
        else:
            return [concepts]
    return []

map_terms(source_terms, term_column='cleaned_term', mapped_concept_id_column='mapped_concept_id', mapped_concept_name_column='mapped_concept_name')

Maps source terms in a DataFrame column to concept IDs using the pre-built index.

Parameters:

    source_terms (DataFrame): DataFrame containing the source clinical terms to map. Required.
    term_column (str): Name of the column with terms to map. Default: 'cleaned_term'.
    mapped_concept_id_column (str): Name of the column to store matched concept IDs. Default: 'mapped_concept_id'.
    mapped_concept_name_column (str): Name of the column to store matched concept names. Default: 'mapped_concept_name'.

Returns:

    DataFrame: A DataFrame with the original columns and their mapped concept IDs and names.
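
A short usage sketch, again assuming the mapper instance from above; the input terms are illustrative:

import pandas as pd

terms = pd.DataFrame({"cleaned_term": ["atrial fibrillation", "not a real term"]})
mapped = mapper.map_terms(terms)

# Only the first match per term is kept; unmatched terms get -1 and an empty name.
print(mapped[["cleaned_term", "mapped_concept_id", "mapped_concept_name"]])
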

Source code in src/ariadne/verbatim_mapping/vocab_verbatim_term_mapper.py
def map_terms(
    self,
    source_terms: pd.DataFrame,
    term_column: str = "cleaned_term",
    mapped_concept_id_column: str = "mapped_concept_id",
    mapped_concept_name_column: str = "mapped_concept_name",
) -> pd.DataFrame:
    """
    Maps source terms in a DataFrame column to concept IDs using the pre-built index.

    Args:
        source_terms: DataFrame containing the source clinical terms to map
        term_column: Name of the column with terms to map
        mapped_concept_id_column: Name of the column to store matched concept IDs.
        mapped_concept_name_column: Name of the column to store matched concept names.

    Returns:
        A DataFrame with the original columns and their mapped concept IDs and names.
    """
    source_terms[[mapped_concept_id_column, mapped_concept_name_column]] = source_terms[term_column].apply(
        lambda term: pd.Series(self.map_term(term)[0] if self.map_term(term) else (-1, ""))
    )
    return source_terms