concept_selection_evaluator

evaluate(selection_results, gold_standard_file='data/gold_standards/exact_matching_gs.csv', source_id_column='source_concept_id', term_column='cleaned_term', mapped_concept_id_column='mapped_concept_id', mapped_concept_name_column='mapped_concept_name', mapped_rationale_column='mapped_rationale', mapped_method_column='map_method', source_ids=None)

Evaluate the concept selection results against the gold standard.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `selection_results` | `DataFrame` | Pandas DataFrame containing the concept selection results. | *required* |
| `gold_standard_file` | `str` | Path to the CSV file containing the gold standard mappings. | `'data/gold_standards/exact_matching_gs.csv'` |
| `source_id_column` | `str` | Name of the column with source concept IDs. | `'source_concept_id'` |
| `term_column` | `str` | Name of the column with source terms. | `'cleaned_term'` |
| `mapped_concept_id_column` | `str` | Name of the column with mapped concept IDs. | `'mapped_concept_id'` |
| `mapped_concept_name_column` | `str` | Name of the column with mapped concept names. | `'mapped_concept_name'` |
| `mapped_rationale_column` | `Optional[str]` | Optional name of the column with mapping rationales. | `'mapped_rationale'` |
| `mapped_method_column` | `Optional[str]` | Optional name of the column with mapping methods, e.g. `"verbatim"` or `"llm"`. | `'map_method'` |
| `source_ids` | `Optional[List[int]]` | Optional list of source concept IDs to evaluate. If `None`, all are evaluated. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | A Pandas DataFrame with the evaluation results. |
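A minimal usage sketch follows. The import path is inferred from the source location shown below; the example rows, concept IDs, and concept names are hypothetical placeholders.

```python
import pandas as pd

# Import path inferred from src/ariadne/evaluation/concept_selection_evaluator.py.
from ariadne.evaluation.concept_selection_evaluator import evaluate

# Hypothetical selection results from an upstream mapping step; the column
# names match the evaluate() defaults.
selection_results = pd.DataFrame(
    {
        "source_concept_id": [101, 102],
        "cleaned_term": ["myocardial infarction", "type 2 diabetes"],
        "mapped_concept_id": [4329847, 201826],
        "mapped_concept_name": ["Myocardial infarction", "Type 2 diabetes mellitus"],
        "map_method": ["verbatim", "llm"],
        "mapped_rationale": ["exact string match", "best candidate selected by the LLM"],
    }
)

evaluation_df = evaluate(
    selection_results,
    source_ids=[101, 102],  # optional: restrict evaluation to these source concepts
)
print(evaluation_df[["source_concept_id", "is_correct", "overall_accuracy"]])
```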

Source code in src/ariadne/evaluation/concept_selection_evaluator.py
def evaluate(
    selection_results: pd.DataFrame,
    gold_standard_file: str = "data/gold_standards/exact_matching_gs.csv",
    source_id_column: str = "source_concept_id",
    term_column: str = "cleaned_term",
    mapped_concept_id_column: str = "mapped_concept_id",
    mapped_concept_name_column: str = "mapped_concept_name",
    mapped_rationale_column: Optional[str] = "mapped_rationale",
    mapped_method_column: Optional[str] = "map_method",
    source_ids: Optional[List[int]] = None,
) -> pd.DataFrame:
    """
    Evaluate the concept selection results against the gold standard.

    Args:
        selection_results: Pandas DataFrame containing the concept selection results.
        gold_standard_file: Path to the CSV file containing the gold standard mappings.
        source_id_column: Name of the column with source concept IDs.
        term_column: Name of the column with source terms.
        mapped_concept_id_column: Name of the column with mapped concept IDs.
        mapped_concept_name_column: Name of the column with mapped concept names.
        mapped_method_column: Optional: Name of the column with mapping methods, e.g. "verbatim" or "llm".
        mapped_rationale_column: Optional: Name of the column with mapping rationales.
        source_ids: Optional list of source concept IDs to evaluate. If None, evaluate all.

    Returns:
        A Pandas DataFrame with the evaluation results.
    """
    gold_standard = pd.read_csv(resolve_path(gold_standard_file))

    if mapped_method_column:
        output_mapped_method_column = mapped_method_column
    else:
        output_mapped_method_column = "map_method"

    selection_results.reset_index(drop=True, inplace=True)
    evaluation_results = []
    for index, row in selection_results.iterrows():
        source_id = int(row[source_id_column])
        if source_ids is not None and source_id not in source_ids:
            continue

        gold_entry = gold_standard[gold_standard[SOURCE_CONCEPT_ID] == source_id]
        if gold_entry.empty:
            continue
        gold_entry = gold_entry.iloc[0]
        gold_target_concept_id = gold_entry[TARGET_CONCEPT_ID]
        gold_target_concept_id_b = gold_entry[TARGET_CONCEPT_ID_B]
        gold_predicate = gold_entry[PREDICATE]
        gold_predicate_b = gold_entry[PREDICATE_B]

        mapped_concept_id = int(row[mapped_concept_id_column])
        if mapped_method_column and mapped_method_column in row:
            map_method = row[mapped_method_column]
        else:
            map_method = "unknown"
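        # Count the mapping as correct if it matches a gold target whose predicate
        # is an exact match, or if the mapped concept ID is -1 while the gold
        # predicate is only a broad match.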
        is_correct = (
            (mapped_concept_id == gold_target_concept_id and gold_predicate == EXACT_MATCH)
            or (mapped_concept_id == gold_target_concept_id_b and gold_predicate_b == EXACT_MATCH)
            or (mapped_concept_id == -1 and gold_predicate == BROAD_MATCH)
            or (mapped_concept_id == -1 and gold_predicate_b == BROAD_MATCH)
        )
        result_row = {
            SOURCE_CONCEPT_ID: source_id,
            SOURCE_TERM: gold_entry.get(SOURCE_TERM),
            output_mapped_method_column: map_method,
            TARGET_CONCEPT_ID: gold_target_concept_id,
            TARGET_CONCEPT_NAME: gold_entry.get(TARGET_CONCEPT_NAME),
            PREDICATE: gold_predicate,
            TARGET_CONCEPT_ID_B: gold_target_concept_id_b,
            TARGET_CONCEPT_NAME_B: gold_entry.get(TARGET_CONCEPT_NAME_B),
            PREDICATE_B: gold_predicate_b,
        }
        if term_column != SOURCE_TERM:
            result_row[term_column] = row[term_column]
        result_row.update(
            {
                mapped_concept_id_column: mapped_concept_id,
                mapped_concept_name_column: row[mapped_concept_name_column],
                "is_correct": is_correct,
            }
        )
        if mapped_rationale_column and mapped_rationale_column in row:
            result_row[mapped_rationale_column] = row[mapped_rationale_column]

        evaluation_results.append(result_row)
    evaluation_df = pd.DataFrame(evaluation_results)

    # Add overall accuracy as a column:
    accuracy = evaluation_df["is_correct"].mean()
    evaluation_df["overall_accuracy"] = accuracy

    return evaluation_df
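
The returned frame can be summarized directly after the call; a short sketch, assuming the default output column names (`map_method`, `is_correct`, `overall_accuracy`):

```python
# Per-method accuracy, grouped on the map-method column written by evaluate()
# (the default output name "map_method" is assumed here).
per_method_accuracy = evaluation_df.groupby("map_method")["is_correct"].mean()
print(per_method_accuracy)

# evaluate() repeats the overall accuracy on every row, so any single value works.
print(f"Overall accuracy: {evaluation_df['overall_accuracy'].iloc[0]:.2%}")
```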