GeneLab_Data_Processing icon indicating copy to clipboard operation
GeneLab_Data_Processing copied to clipboard

[BulkRNASeq] VV_DESEQ2_ANALYSIS error with all integer sample names

Open torres-alexis opened this issue 1 year ago • 0 comments

Existing code in RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/checks.py:

def check_dge_table_group_columns_constraints(
    dge_table: Path, runsheet: Path, samples: set[str], **_
) -> FlagEntry:
    FLOAT_TOLERANCE = (
        0.001  # Percent allowed difference due to float precision differences
    )
    # data specific preprocess
    GROUP_PREFIXES = ["Group.Stdev_", "Group.Mean_"]
    expected_groups = utils_runsheet_to_expected_groups(runsheet)
    query_columns = {
        "".join(comb)
        for comb in itertools.product(GROUP_PREFIXES, expected_groups.values())
    }
    expected_group_lists = utils_runsheet_to_expected_groups(
        runsheet, map_to_lists=True, limit_to_samples=samples
    )
    df_dge = pd.read_csv(dge_table)
    # issue trackers
    issues: dict[str, list[str]] = {
        f"mean computation deviates by more than {FLOAT_TOLERANCE} percent": [],
        f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent": [],
    }
    group: str
    sample_set: list[str]
    for group, sample_set in expected_group_lists.items():
        abs_percent_differences = abs(
            (df_dge[f"Group.Mean_{group}"] - df_dge[sample_set].mean(axis="columns"))
            / df_dge[sample_set].mean(axis="columns")
            * 100
        )
        if any(abs_percent_differences > FLOAT_TOLERANCE):
            issues[
                f"mean computation deviates by more than {FLOAT_TOLERANCE} percent"
            ].append(group)

        abs_percent_differences = abs(
            (df_dge[f"Group.Stdev_{group}"] - df_dge[sample_set].std(axis="columns"))
            / df_dge[sample_set].mean(axis="columns")
            * 100
        )
        if any(abs_percent_differences > FLOAT_TOLERANCE):
            issues[
                f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent"
            ].append(group)
    # check logic
    contraint_description = f"Group mean and standard deviations are correctly computed from samplewise normalized counts within a tolerance of {FLOAT_TOLERANCE} percent (to accomodate minor float related differences )"
    if not any([issue_type for issue_type in issues.values()]):
        code = FlagCode.GREEN
        message = f"All values in columns: {query_columns} met constraint: {contraint_description}"
    else:
        code = FlagCode.HALT
        message = (
            f"Issues found {issues} that"
            f"fail the contraint: {contraint_description}."
        )
    return {"code": code, "message": message}

Suggestion: remove list[str] typing hint from samples, cast sample names in sample_names to string

    group: str
    sample_set: list
    for group, sample_set in expected_group_lists.items():
        sample_set_str = [str(sample) for sample in sample_set]  # Convert sample names to strings
        abs_percent_differences = abs(
            (df_dge[f"Group.Mean_{group}"] - df_dge[sample_set_str].mean(axis="columns"))
            / df_dge[sample_set_str].mean(axis="columns")
            * 100
        )
        if any(abs_percent_differences > FLOAT_TOLERANCE):
            issues[
                f"mean computation deviates by more than {FLOAT_TOLERANCE} percent"
            ].append(group)

        abs_percent_differences = abs(
            (df_dge[f"Group.Stdev_{group}"] - df_dge[sample_set_str].std(axis="columns"))
            / df_dge[sample_set_str].mean(axis="columns")
            * 100
        )
        if any(abs_percent_differences > FLOAT_TOLERANCE):
            issues[
                f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent"
            ].append(group)

torres-alexis avatar Aug 20 '24 08:08 torres-alexis