Ghidra model producing less variables than expected in the wild
In a discussion with @edmcman here: https://github.com/mahaloz/DAILA/pull/32#issuecomment-1961793064
We have both independently noticed VarBERT is producing fewer variables than expected on binaries such as /bin/ls. It's unclear now which of the following (combinations?) is the cause:
- A bug in how we are passing data to VarBERT
- A bad choice of model to use at-scale
- An incorrect model
We should investigate this to make the real-world use better. It may also be worth looking at our Ghidra testcases again.
I can upload specific examples if it is helpful.
@edmcman that would be helpful, yes
Here's zip containing a Ghidra gzf: file.zip
int * FUN_1000047b4(int level,char **argc,uint flags)
{
int iVar1;
code *UNRECOVERED_JUMPTABLE_00;
uint uVar2;
FTS *pFVar3;
FTSENT *pFVar4;
int *piVar5;
FTSENT *pFVar6;
int iVar7;
ulong unaff_x30;
FTSENT *r;
UNRECOVERED_JUMPTABLE_00 = (code *)0x0;
if (DAT_10000c0d8 == '\0') {
UNRECOVERED_JUMPTABLE_00 = FUN_100004abc;
}
pFVar3 = _fts_open(argc,flags,(int *)UNRECOVERED_JUMPTABLE_00);
if (pFVar3 == (FTS *)0x0) {
FUN_1000071dc();
}
else {
pFVar4 = _fts_children(pFVar3,0);
if (pFVar4 != (FTSENT *)0x0) {
FUN_100004b3c(0,pFVar4);
}
if (DAT_10000c0f8 == '\x01') {
if (((unaff_x30 ^ unaff_x30 << 1) >> 0x3e & 1) != 0) {
/* WARNING: Treating indirect jump as call */
UNRECOVERED_JUMPTABLE_00 = (code *)SoftwareBreakpoint(0xc471,0x100004860);
piVar5 = (int *)(*UNRECOVERED_JUMPTABLE_00)();
return piVar5;
}
uVar2 = _fts_close(pFVar3);
return (int *)(ulong)uVar2;
}
iVar7 = 0;
if (((flags & 8) == 0 | DAT_10000c0f0 & 1) == 0) {
iVar7 = 0x100;
}
piVar5 = ___error();
*piVar5 = 0;
pFVar4 = _fts_read(pFVar3);
while (pFVar4 != (FTSENT *)0x0) {
if (pFVar4->fts_info - 1 < 0xd) {
switch((uint)pFVar4->fts_info) {
default:
if ((((pFVar4->fts_level == 0) || (pFVar4->fts_name[0] != '.')) ||
((DAT_10000c0dc & 1) != 0)) &&
(((DAT_10000c088 == 0 || (pFVar4->fts_statp == (stat *)0x0)) ||
((*(byte *)((long)&pFVar4->fts_statp->st_flags + 3) >> 6 & 1) == 0)))) {
if (DAT_10000c11c == '\x01') {
_putchar(10);
FUN_1000055f8(pFVar4->fts_path);
_puts(":");
}
else if (1 < level) {
FUN_1000055f8(pFVar4->fts_path);
_puts(":");
DAT_10000c11c = '\x01';
}
pFVar6 = _fts_children(pFVar3,iVar7);
if (((flags >> 1 & 1) != 0) && (r = pFVar6, DAT_10000c024 != '\0')) {
for (; r != (FTSENT *)0x0; r = r->fts_link) {
if (r->fts_info == 0xd) {
r->fts_number = 1;
}
}
}
FUN_100004b3c(pFVar4,pFVar6);
if (DAT_10000c0f0 == 1 || pFVar6 == (FTSENT *)0x0) break;
}
_fts_set(pFVar3,pFVar4,4);
break;
case 2:
FUN_100007214(pFVar4);
break;
case 3:
case 5:
case 8:
case 9:
case 10:
case 0xb:
case 0xc:
break;
case 4:
case 7:
FUN_100007254(pFVar4);
break;
case 6:
iVar1 = pFVar4->fts_errno;
if (iVar1 != 0) {
piVar5 = ___error();
*piVar5 = iVar1;
goto LAB_100004a28;
}
break;
case 0xd:
if (((flags >> 1 & 1) != 0) && (DAT_10000c024 != '\0')) {
FUN_100007298(pFVar4);
}
}
}
piVar5 = ___error();
*piVar5 = 0;
pFVar4 = _fts_read(pFVar3);
}
LAB_100004a28:
piVar5 = ___error();
if (*piVar5 == 0x51) {
iVar7 = 1;
}
else {
piVar5 = ___error();
iVar7 = *piVar5;
}
_fts_close(pFVar3);
piVar5 = ___error();
*piVar5 = iVar7;
piVar5 = ___error();
if (*piVar5 == 0) {
return piVar5;
}
}
FUN_1000071f8();
/* WARNING: Treating indirect jump as call */
UNRECOVERED_JUMPTABLE_00 = (code *)UndefinedInstructionException(0xc,0x100004a88);
piVar5 = (int *)(*UNRECOVERED_JUMPTABLE_00)();
return piVar5;
}
Some other functions that might be worth looking at:
- 0x100003824
- 0x100004b3c
Hello @edmcman,
Thank you for reporting the issue. I did some preliminary analysis to understand why we see fewer variables renamed when "Suggest new variable names (source-like only)" is enabled. It turns out that Ghidra 11 can map a lot more variable names from DWARF in the decompilation compared to Ghidra 10.4. Since the VarBERT Ghidra models were trained on a data set generated using Ghidra 10.4, they predicted a lot more variables to be decompiler-generated, which, according to Ghidra 11, are now source code-generated.
We plan to train new models compatible with Ghidra 11 for improved performance.
That's interesting, thanks for the update!