GeneLab_Data_Processing
GeneLab_Data_Processing copied to clipboard
[BulkRNASeq] VV_DESEQ2_ANALYSIS error with all integer sample names
Existing code in RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dp_tools__NF_RCP/checks.py:
def check_dge_table_group_columns_constraints(
    dge_table: Path, runsheet: Path, samples: set[str], **_
) -> FlagEntry:
    """Flag DGE-table group mean/stdev columns that disagree with the sample columns.

    Reads ``dge_table`` as CSV and, for every group returned by
    ``utils_runsheet_to_expected_groups(runsheet, map_to_lists=True,
    limit_to_samples=samples)``, recomputes the row-wise mean and standard
    deviation over that group's sample columns and compares them with the
    ``Group.Mean_<group>`` and ``Group.Stdev_<group>`` columns.

    Both differences are expressed as a percentage of the recomputed row mean
    (the stdev difference is divided by the mean as well, not by the stdev).
    A group is recorded as an issue when any row exceeds ``FLOAT_TOLERANCE``.

    Args:
        dge_table: CSV file loaded with ``pd.read_csv``.
        runsheet: Forwarded to ``utils_runsheet_to_expected_groups``.
        samples: Forwarded as ``limit_to_samples``; presumably restricts the
            per-group sample lists to these samples -- confirm against the helper.
        **_: Any extra keyword arguments are accepted and ignored.

    Returns:
        A mapping with ``code`` (``FlagCode.GREEN`` when no group was recorded
        as an issue, ``FlagCode.HALT`` otherwise) and a ``message`` string.
    """
    FLOAT_TOLERANCE = (
        0.001  # Percent allowed difference due to float precision differences
    )
    # data specific preprocess
    GROUP_PREFIXES = ["Group.Stdev_", "Group.Mean_"]
    expected_groups = utils_runsheet_to_expected_groups(runsheet)
    # Every prefix + group-name combination; only used in the GREEN message below.
    query_columns = {
        "".join(comb)
        for comb in itertools.product(GROUP_PREFIXES, expected_groups.values())
    }
    # Iterated below as (group, sample_set) pairs; presumably group name ->
    # list of sample names -- confirm against the helper.
    expected_group_lists = utils_runsheet_to_expected_groups(
        runsheet, map_to_lists=True, limit_to_samples=samples
    )
    df_dge = pd.read_csv(dge_table)
    # issue trackers
    # Maps each issue description to the list of groups that triggered it.
    issues: dict[str, list[str]] = {
        f"mean computation deviates by more than {FLOAT_TOLERANCE} percent": [],
        f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent": [],
    }
    group: str
    sample_set: list[str]
    for group, sample_set in expected_group_lists.items():
        # NOTE(review): sample_set is used as-is for column selection. This is
        # reported to error when every sample name is an integer -- presumably
        # the helper then yields non-string names while the labels parsed from
        # the CSV header are strings. Confirm and cast to str if so.
        abs_percent_differences = abs(
            (df_dge[f"Group.Mean_{group}"] - df_dge[sample_set].mean(axis="columns"))
            / df_dge[sample_set].mean(axis="columns")
            * 100
        )
        if any(abs_percent_differences > FLOAT_TOLERANCE):
            issues[
                f"mean computation deviates by more than {FLOAT_TOLERANCE} percent"
            ].append(group)
        # The stdev difference is normalised by the row mean, not the row stdev.
        # pandas' std() defaults to ddof=1 (sample standard deviation).
        abs_percent_differences = abs(
            (df_dge[f"Group.Stdev_{group}"] - df_dge[sample_set].std(axis="columns"))
            / df_dge[sample_set].mean(axis="columns")
            * 100
        )
        if any(abs_percent_differences > FLOAT_TOLERANCE):
            issues[
                f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent"
            ].append(group)
    # check logic
    contraint_description = f"Group mean and standard deviations are correctly computed from samplewise normalized counts within a tolerance of {FLOAT_TOLERANCE} percent (to accomodate minor float related differences )"
    # GREEN only when every issue list is still empty.
    if not any([issue_type for issue_type in issues.values()]):
        code = FlagCode.GREEN
        message = f"All values in columns: {query_columns} met constraint: {contraint_description}"
    else:
        code = FlagCode.HALT
        # NOTE(review): the two adjacent f-strings are concatenated without a
        # separating space, so the message reads "...thatfail the contraint...".
        message = (
            f"Issues found {issues} that"
            f"fail the contraint: {contraint_description}."
        )
    return {"code": code, "message": message}
Suggestion: loosen the `list[str]` type hint on `sample_set` to `list`, and cast the sample names in `sample_set` to strings before using them to select columns from `df_dge`:
group: str
sample_set: list
for group, sample_set in expected_group_lists.items():
    # Sample names may not be strings (e.g. all-integer names), so stringify
    # them before using them as column labels.
    sample_columns = [str(name) for name in sample_set]

    # Reported group mean vs. the row-wise mean recomputed from the samples,
    # as a percentage of the recomputed mean.
    reported_mean = df_dge[f"Group.Mean_{group}"]
    row_means = df_dge[sample_columns].mean(axis="columns")
    mean_deviation = ((reported_mean - row_means) / row_means * 100).abs()
    if (mean_deviation > FLOAT_TOLERANCE).any():
        issues[
            f"mean computation deviates by more than {FLOAT_TOLERANCE} percent"
        ].append(group)

    # Same comparison for the standard deviation; the difference is still
    # expressed relative to the row mean, not the row stdev.
    reported_stdev = df_dge[f"Group.Stdev_{group}"]
    row_stdevs = df_dge[sample_columns].std(axis="columns")
    stdev_deviation = ((reported_stdev - row_stdevs) / row_means * 100).abs()
    if (stdev_deviation > FLOAT_TOLERANCE).any():
        issues[
            f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent"
        ].append(group)