scripts icon indicating copy to clipboard operation
scripts copied to clipboard

Ported to Ubuntu (Linux Mint Cinnamon)

Open roddyongithub opened this issue 2 years ago • 1 comments

Hello Tomer, I have started porting the find-dupes script to Ubuntu. I completely changed the parsing and made some minor changes. It's working - yay! But I'm a complete newbie to awk. I would love to add some functionality, such as sorting by size and printing the size within the report, but sorting via asort and asorti seems to mess up the arrays. Arrays are my biggest problem; for example, I have no clue what this line is doing: `file_size[$5, ++file_size[$5, "length"]] = dir "/" fname`. Could you please help? I'd love to clean up my 9TB of data ;-) And what is the best way to publish my Ubuntu port? Thanks a lot. Yes, it's still a mess ...

# Run it like this from inside the folder you want to scan (the three lines below form one pipeline):
# shopt -s globstar && ls -aldp ./**/* | grep -v /$ 
#  | awk '{print $1,"\t",$2,"\t",$3,"\t",$4,"\t",$5,"\t",$6,"\t",$7,"\t",$8,"\t",$9,$10,$11,$12,$13,$14,$15;}' 
#  | awk -F $'\t' -f [path]/find-dupes.awk

BEGIN {
    # Command used to hash a file; the END rule runs it once per candidate.
    md5_exec = "md5sum"

    # Output field separator; the END rule also uses it to indent each
    # duplicate path in the final report.
    OFS = "\t"
}

## Parse tab separated input.
## Each non-empty record is one `ls -l` line re-joined with tabs by the
## caller's pipeline: $5 is the size column and $9 is the path (with any
## further space-separated name parts appended by that pipeline).
## Builds:
##   file_size[size, "length"]  number of paths seen with this size
##   file_size[size, k]         k-th path seen with this size (1-based)
##   sizes[size]                set of sizes shared by more than one path
NF {
    gsub(/^[ ]+|[ ]+$/, "", $9)              # strip leading/trailing spaces

    # A record without a path cannot be hashed later, so ignore it.
    if ($9 == "")
        next

    # Store the path exactly as listed. Splitting it into directory and
    # file name and gluing them back with "/" is a no-op for paths that
    # contain a "/", but turned a bare file name into "/name".
    file_size[$5, ++file_size[$5, "length"]] = $9

    # Only sizes seen at least twice are worth hashing. Sizes of 35 or less
    # are skipped as well (NOTE(review): presumably to ignore tiny files --
    # confirm the intended threshold).
    if (file_size[$5, "length"] > 1 && $5 > 35)
        sizes[$5]                            # bare reference creates the key
}

## Hash every file whose size is shared with at least one other file, then
## report the groups of files that have identical MD5 hashes.
## Reads:  sizes[], file_size[] (filled by the per-record rule), md5_exec.
## Builds: file_hash[hash, "length"], file_hash[hash, k], hashes[hash].
END {
    ## Find the files that have identical sizes, and then get their MD5 hash:
    for (size in sizes)
        for (i = 1; i <= file_size[size, "length"]; i++) {
            file = file_size[size, i]
            cmd = md5_exec " " shell_quote(file)

            # Read into a variable so $0 and the fields are left untouched,
            # and always close the pipe: otherwise one pipe per hashed file
            # stays open and large scans run out of file descriptors.
            got = (cmd | getline line)
            close(cmd)

            # No output means the hash command failed (unreadable file,
            # symlink line, ...). It reports its own error on stderr; skip
            # the file instead of reusing the previous file's hash.
            if (got <= 0)
                continue

            split(line, a, " "); hash = a[1]
            # GNU md5sum starts the line with "\" when it had to escape the
            # file name; that marker is not part of the digest.
            sub(/^\\/, "", hash)
            if (hash == "")
                continue

            print hash " -" size "bytes: " file
            file_hash[hash, ++file_hash[hash, "length"]] = file
            if (file_hash[hash, "length"] > 1)
                hashes[hash]                 # bare reference creates the key
        }

    ## Report files that have identical MD5 hashes:
    print "\n#### Duplicates ###"
    for (hash in hashes) {
        print "MD5 " hash ":"
        for (i = 1; i <= file_hash[hash, "length"]; i++)
            print OFS file_hash[hash, i]
    }
}

## Return s wrapped in single quotes for use as one shell word; every
## embedded single quote is rewritten as '\'' so it cannot end the quoting.
## The names after the extra spaces in the parameter list are locals.
function shell_quote(s,    parts, n, k, out) {
    n = split(s, parts, "'")
    out = parts[1]
    for (k = 2; k <= n; k++)
        out = out "'\\''" parts[k]
    return "'" out "'"
}

roddyongithub avatar Jul 01 '23 12:07 roddyongithub

And why do you have if(file_size[$5, "length"] > 1 && $5 > 35) ? Do you have a lot of small files you want to exclude?

roddyongithub avatar Jul 01 '23 12:07 roddyongithub