Skip to content

drug_mapper

DrugMapper

Source code in src/ariadne/llm_mapping/drug_mapper.py
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
class DrugMapper:
    """Map drug source concepts to target concepts, one concept class at a time.

    For every supported concept class present in the input, the rows are first
    passed through a verbatim term mapper; terms it leaves unmatched (mapped
    id ``-1``) are sent to a Hecate candidate search followed by an LLM mapper.
    The per-class settings come from ``config.mapping_per_concept_class``.
    """

    # Column names used for an empty result and produced by the final rename
    # of a non-empty result.
    _OUTPUT_COLUMNS = ("concept_code", "source_name", "concept_id", "concept_name")

    # Columns taken from the LLM mapper output when merging it into the results.
    _MAPPED_COLUMNS = ("concept_code", "concept_name", "mapped_concept_id", "mapped_concept_name")

    def __init__(self, config: "ConfigDrugMapping | None" = None):
        """Create a mapper.

        Args:
            config: Drug-mapping configuration. When omitted, a fresh
                ``ConfigDrugMapping()`` is created for this instance.
        """
        # Built here rather than in the signature: a default evaluated at
        # definition time would be one object shared by every DrugMapper, and
        # _map_class_rows writes into the settings it is given.
        self.config = ConfigDrugMapping() if config is None else config

    @staticmethod
    def _validate_input_columns(df: pd.DataFrame) -> None:
        """Raise ``ValueError`` if *df* lacks any column the mapper reads.

        Args:
            df: The ``drug_concept_stage`` frame to check.

        Raises:
            ValueError: If ``concept_name``, ``concept_class_id`` or
                ``concept_code`` is missing; the message lists the missing
                columns in sorted order.
        """
        required_columns = {"concept_name", "concept_class_id", "concept_code"}
        missing_columns = sorted(required_columns - set(df.columns))
        if missing_columns:
            raise ValueError(f"drug_concept_stage is missing required columns: {missing_columns}")

    @staticmethod
    def _hecate_kwargs(scf: "StandardConceptFilter") -> dict:
        """Build keyword arguments for :class:`HecateConceptSearcher` from a filter.

        Args:
            scf: Filter supplying ``standard_concept``, ``domain_ids``,
                ``concept_class_ids`` and ``vocabularies``.

        Returns:
            A dict with the keys ``standard_concept``, ``domain_ids``,
            ``concept_class_ids`` and ``vocabulary_ids``.
        """
        return dict(
            # NOTE(review): the falsy case yields the literal string "None",
            # not the None object - presumably what Hecate expects; confirm.
            standard_concept="S" if scf.standard_concept else "None",
            domain_ids=scf.domain_ids,
            concept_class_ids=scf.concept_class_ids,
            vocabulary_ids=scf.vocabularies,
        )

    @classmethod
    def _empty_result(cls) -> pd.DataFrame:
        """Return an empty frame carrying the output column names."""
        return pd.DataFrame(columns=list(cls._OUTPUT_COLUMNS))

    @classmethod
    def _merge_llm_matches(cls, work_df: pd.DataFrame, llm_matches: pd.DataFrame) -> pd.DataFrame:
        """Combine verbatim results with LLM results without losing source rows.

        Args:
            work_df: Verbatim mapper output; unmatched rows have
                ``mapped_concept_id == -1``.
            llm_matches: LLM mapper output containing at least the columns in
                ``_MAPPED_COLUMNS``.

        Returns:
            Rows already resolved by the verbatim mapper, followed by the LLM
            rows, followed by unmatched rows the LLM output does not cover.
        """
        is_unmatched = work_df["mapped_concept_id"] == -1
        llm_rows = llm_matches[list(cls._MAPPED_COLUMNS)]
        # Keep unmatched terms the LLM returned nothing for, so they surface
        # with a null id just as they do when no candidates are found at all.
        not_covered = ~work_df["concept_code"].isin(llm_rows["concept_code"])
        return pd.concat([
            work_df[~is_unmatched],
            llm_rows,
            work_df[is_unmatched & not_covered],
        ])

    @staticmethod
    def _to_output_columns(mapped: pd.DataFrame) -> pd.DataFrame:
        """Rename the working columns to the output column names.

        ``concept_name`` becomes ``source_name``, and the ``mapped_concept_*``
        columns become ``concept_id`` / ``concept_name``.
        """
        return mapped.rename(columns={
            "concept_name": "source_name",
            "mapped_concept_id": "concept_id",
            "mapped_concept_name": "concept_name",
        })

    def _map_class_rows(self, class_rows: pd.DataFrame, cc: "MappingPerConceptClassSettings") -> pd.DataFrame:
        """Map the rows of a single concept class.

        Args:
            class_rows: Input rows of one concept class; ``concept_code`` and
                ``concept_name`` are read.
            cc: Settings for this class, providing ``verbatim_mapping``,
                ``vector_search`` and ``llm_mapping``.

        Returns:
            A frame with the mapper output columns, in which
            ``mapped_concept_id`` values of ``-1`` or NA are replaced by
            ``None``.
        """
        vm_settings = cc.verbatim_mapping
        vs_settings = cc.vector_search
        llm_settings = cc.llm_mapping

        download_terms(settings=vm_settings)
        verbatim_mapper = VocabVerbatimTermMapper(settings=vm_settings)

        work_df = class_rows[["concept_code", "concept_name"]].copy()
        work_df = verbatim_mapper.map_terms(
            source_terms=work_df,
            term_column="concept_name",
            mapped_concept_id_column="mapped_concept_id",
            mapped_concept_name_column="mapped_concept_name",
        )

        # -1 is the value the rest of this method treats as "no verbatim match".
        unmatched = work_df[work_df["mapped_concept_id"] == -1].copy()
        if not unmatched.empty:
            hecate_kwargs = self._hecate_kwargs(vm_settings.standard_concept_filter)
            hecate = HecateConceptSearcher(**hecate_kwargs)
            candidates = hecate.search_terms(
                unmatched,
                term_column="concept_name",
                limit=vs_settings.max_candidates,
                standard_concept=hecate_kwargs["standard_concept"],
                domain_ids=hecate_kwargs["domain_ids"],
                concept_class_ids=hecate_kwargs["concept_class_ids"],
                vocabulary_ids=hecate_kwargs["vocabulary_ids"],
            )

            if not candidates.empty:
                candidates = add_concept_context(
                    concept_table=candidates,
                    add_parents=True,
                    add_children=False,
                    add_synonyms=True,
                )
                # Children are not added to the candidates above, so the LLM
                # context is told not to include them.
                # NOTE(review): this writes to the caller's settings object in
                # place; the change outlives this call.
                llm_settings.context.include_target_children = False
                mapper = LlmMapper(settings=llm_settings)
                llm_matches = mapper.map_terms(
                    source_target_concepts=candidates,
                    source_id_column="concept_code",
                    term_column="concept_name",
                    source_term_column="concept_name",
                    children_column=None,
                )
                if not llm_matches.empty:
                    work_df = self._merge_llm_matches(work_df, llm_matches)

        # Replace the -1 sentinel (and NA) with None; keep real ids as ints.
        work_df["mapped_concept_id"] = work_df["mapped_concept_id"].apply(
            lambda value: int(value) if pd.notna(value) and int(value) != -1 else None
        )
        return work_df

    def map_drug_concepts(self, drug_concept_stage: pd.DataFrame) -> pd.DataFrame:
        """Map all supported rows of a ``drug_concept_stage`` frame.

        Args:
            drug_concept_stage: Frame with at least ``concept_name``,
                ``concept_class_id`` and ``concept_code``. Rows whose
                ``concept_class_id`` is not a supported class are ignored.

        Returns:
            A frame with ``concept_code``, ``source_name`` (the input
            ``concept_name``), ``concept_id`` and ``concept_name`` (the mapped
            target). It is empty, with those four columns, when no input row
            belongs to a supported class.

        Raises:
            ValueError: If a required column is missing, or if a concept class
                present in the input has no entry in
                ``config.mapping_per_concept_class``.
        """
        self._validate_input_columns(drug_concept_stage)

        relevant_rows = drug_concept_stage[drug_concept_stage["concept_class_id"].isin(_SUPPORTED_CONCEPT_CLASSES)].copy()
        if relevant_rows.empty:
            return self._empty_result()

        mapped_batches = []
        for concept_class_id in _SUPPORTED_CONCEPT_CLASSES:
            class_rows = relevant_rows[relevant_rows["concept_class_id"] == concept_class_id]
            if class_rows.empty:
                continue

            config_key = _CONCEPT_CLASS_TO_CONFIG_KEY[concept_class_id]
            if config_key not in self.config.mapping_per_concept_class:
                raise ValueError(
                    f"Missing mapping_per_concept_class config for '{config_key}' ({concept_class_id})"
                )

            cc = self.config.mapping_per_concept_class[config_key]
            mapped_batches.append(self._map_class_rows(class_rows, cc))

        if not mapped_batches:
            return self._empty_result()

        relationship_to_concept = pd.concat(mapped_batches, ignore_index=True)
        return self._to_output_columns(relationship_to_concept)