varbert_api icon indicating copy to clipboard operation
varbert_api copied to clipboard

Ghidra model producing less variables than expected in the wild

Open mahaloz opened this issue 2 years ago • 5 comments

In a discussion with @edmcman here: https://github.com/mahaloz/DAILA/pull/32#issuecomment-1961793064

We have both independently noticed VarBERT is producing fewer variables than expected on binaries such as /bin/ls. It's unclear now which of the following (combinations?) is the cause:

  1. A bug in how we are passing data to VarBERT
  2. A bad choice of model to use at-scale
  3. An incorrect model

We should investigate this to make the real-world use better. It may also be worth looking at our Ghidra testcases again.

mahaloz avatar Feb 23 '24 18:02 mahaloz

I can upload specific examples if it is helpful.

edmcman avatar Feb 23 '24 19:02 edmcman

@edmcman that would be helpful, yes

mahaloz avatar Feb 23 '24 20:02 mahaloz

Here's zip containing a Ghidra gzf: file.zip

Here is the source code of 0x1000047b4 after using DAILA to rename variables:
int * FUN_1000047b4(int level,char **argc,uint flags)

{
  int iVar1;
  code *UNRECOVERED_JUMPTABLE_00;
  uint uVar2;
  FTS *pFVar3;
  FTSENT *pFVar4;
  int *piVar5;
  FTSENT *pFVar6;
  int iVar7;
  ulong unaff_x30;
  FTSENT *r;
  
  UNRECOVERED_JUMPTABLE_00 = (code *)0x0;
  if (DAT_10000c0d8 == '\0') {
    UNRECOVERED_JUMPTABLE_00 = FUN_100004abc;
  }
  pFVar3 = _fts_open(argc,flags,(int *)UNRECOVERED_JUMPTABLE_00);
  if (pFVar3 == (FTS *)0x0) {
    FUN_1000071dc();
  }
  else {
    pFVar4 = _fts_children(pFVar3,0);
    if (pFVar4 != (FTSENT *)0x0) {
      FUN_100004b3c(0,pFVar4);
    }
    if (DAT_10000c0f8 == '\x01') {
      if (((unaff_x30 ^ unaff_x30 << 1) >> 0x3e & 1) != 0) {
                    /* WARNING: Treating indirect jump as call */
        UNRECOVERED_JUMPTABLE_00 = (code *)SoftwareBreakpoint(0xc471,0x100004860);
        piVar5 = (int *)(*UNRECOVERED_JUMPTABLE_00)();
        return piVar5;
      }
      uVar2 = _fts_close(pFVar3);
      return (int *)(ulong)uVar2;
    }
    iVar7 = 0;
    if (((flags & 8) == 0 | DAT_10000c0f0 & 1) == 0) {
      iVar7 = 0x100;
    }
    piVar5 = ___error();
    *piVar5 = 0;
    pFVar4 = _fts_read(pFVar3);
    while (pFVar4 != (FTSENT *)0x0) {
      if (pFVar4->fts_info - 1 < 0xd) {
        switch((uint)pFVar4->fts_info) {
        default:
          if ((((pFVar4->fts_level == 0) || (pFVar4->fts_name[0] != '.')) ||
              ((DAT_10000c0dc & 1) != 0)) &&
             (((DAT_10000c088 == 0 || (pFVar4->fts_statp == (stat *)0x0)) ||
              ((*(byte *)((long)&pFVar4->fts_statp->st_flags + 3) >> 6 & 1) == 0)))) {
            if (DAT_10000c11c == '\x01') {
              _putchar(10);
              FUN_1000055f8(pFVar4->fts_path);
              _puts(":");
            }
            else if (1 < level) {
              FUN_1000055f8(pFVar4->fts_path);
              _puts(":");
              DAT_10000c11c = '\x01';
            }
            pFVar6 = _fts_children(pFVar3,iVar7);
            if (((flags >> 1 & 1) != 0) && (r = pFVar6, DAT_10000c024 != '\0')) {
              for (; r != (FTSENT *)0x0; r = r->fts_link) {
                if (r->fts_info == 0xd) {
                  r->fts_number = 1;
                }
              }
            }
            FUN_100004b3c(pFVar4,pFVar6);
            if (DAT_10000c0f0 == 1 || pFVar6 == (FTSENT *)0x0) break;
          }
          _fts_set(pFVar3,pFVar4,4);
          break;
        case 2:
          FUN_100007214(pFVar4);
          break;
        case 3:
        case 5:
        case 8:
        case 9:
        case 10:
        case 0xb:
        case 0xc:
          break;
        case 4:
        case 7:
          FUN_100007254(pFVar4);
          break;
        case 6:
          iVar1 = pFVar4->fts_errno;
          if (iVar1 != 0) {
            piVar5 = ___error();
            *piVar5 = iVar1;
            goto LAB_100004a28;
          }
          break;
        case 0xd:
          if (((flags >> 1 & 1) != 0) && (DAT_10000c024 != '\0')) {
            FUN_100007298(pFVar4);
          }
        }
      }
      piVar5 = ___error();
      *piVar5 = 0;
      pFVar4 = _fts_read(pFVar3);
    }
LAB_100004a28:
    piVar5 = ___error();
    if (*piVar5 == 0x51) {
      iVar7 = 1;
    }
    else {
      piVar5 = ___error();
      iVar7 = *piVar5;
    }
    _fts_close(pFVar3);
    piVar5 = ___error();
    *piVar5 = iVar7;
    piVar5 = ___error();
    if (*piVar5 == 0) {
      return piVar5;
    }
  }
  FUN_1000071f8();
                    /* WARNING: Treating indirect jump as call */
  UNRECOVERED_JUMPTABLE_00 = (code *)UndefinedInstructionException(0xc,0x100004a88);
  piVar5 = (int *)(*UNRECOVERED_JUMPTABLE_00)();
  return piVar5;
}

Some other functions that might be worth looking at:

  • 0x100003824
  • 0x100004b3c

edmcman avatar Feb 23 '24 21:02 edmcman

Hello @edmcman,

Thank you for reporting the issue. I did some preliminary analysis to understand why we see fewer variables renamed when "Suggest new variable names (source-like only)" is enabled. It turns out that Ghidra 11 can map a lot more variable names from DWARF in the decompilation compared to Ghidra 10.4. Since the VarBERT Ghidra models were trained on a data set generated using Ghidra 10.4, they predicted a lot more variables to be decompiler-generated, which, according to Ghidra 11, are now source code-generated.

We plan to train new models compatible with Ghidra 11 for improved performance.

Atipriya avatar Feb 27 '24 23:02 Atipriya

That's interesting, thanks for the update!

edmcman avatar Feb 28 '24 14:02 edmcman