I have recently found myself employed in a part-time position as a developer/admin for our University's Open Source lab. One of my tasks was to write a script to analyse the NFS-mounted home directories of our 3000+ users. This info is going to be used to design a daily/weekly automated cleanup policy, which isn't my job.

Anyway, here's the code for v0.1.1 (I fixed the sorting problem v0.1 had). It'll get cleaner and easier to use, but for now it does the job and I can move on to tasks with a higher priority than "fiddle and prettify".


# Some constants:
inFileName = "duhomes.txt"  # Where we're reading the data from.
                            # This should be output from 'du'.
basePath = "/home/export/"  # Anything at the beginning of the path to strip.
                            # The resulting path must not have a leading
                            # '/', as this would confuse the tree generator.

# Display parameters:
# sortBy names the per-directory key the listing is ordered by; sortOrder is
# the direction ('ascending'; any other value is treated as descending by
# the sorter inside displayDirTree).
sortBy = 'size'
sortOrder = 'descending'
# Thresholds handed to filterOutput(). A directory is flagged for lumping
# into '*others' when (size < minsize and count < mincount), or when
# size < size, or when count < count.
# NOTE(review): sizes are presumably in du's block units (1K by default) --
# confirm against how duhomes.txt was generated.
minsize = 1024
mincount = 10
size = 102400
count = 25
# Depth limit handed to displayDirTree(); tree levels are numbered from 0.
depth = 2

# Grab the file structure into something a little easier to work with.
# A tree structure works well, and a dictionary is well suited to building
# one.

def buildTree(fileName, prefix=None):
    """Parse 'du' output into a nested dictionary tree.

    Each line of the file is expected to look like "<size><whitespace><path>".
    The result is a dict of the form {'children': {...}}, where every
    directory node is {'children': {...}} plus a 'size' key once a line for
    that exact path has been seen.  Intermediate directories that never get
    their own line have no 'size' key.

    Parameters:
        fileName -- path of the file holding the 'du' output.
        prefix   -- leading part of every path to strip before building the
                    tree.  Defaults to the module-level basePath.

    Returns the root node.  The root receives a 'size' key when a line for
    the base directory itself is present.

    Raises ValueError if a size field is not an integer, and whatever
    open() raises if the file cannot be read.
    """
    if prefix is None:
        prefix = basePath
    # 'du' reports the base directory itself without a trailing slash.
    bareBase = prefix.rstrip("/")
    duTree = {'children': {}}

    def addNode(size, path):
        # Walk down from the root, creating nodes as needed.  Empty path
        # components (leading, trailing or doubled '/') are skipped, so an
        # empty path sets the size of the root and nothing else.
        node = duTree
        for part in path.split("/"):
            if not part:
                continue
            node = node['children'].setdefault(part, {'children': {}})
        node['size'] = size

    with open(fileName) as inFile:
        for line in inFile:
            # Split only once so that paths containing spaces stay intact.
            fields = line.rstrip("\r\n").split(None, 1)
            if len(fields) < 2:
                continue  # Blank line, or a line without a path.
            size, path = fields
            # Strip the prefix only where it actually is a prefix.
            if path.startswith(prefix):
                path = path[len(prefix):]
            elif path == bareBase:
                path = ""
            addNode(int(size), path)
    return duTree

def walkHomes(tree):
    """Merge every home directory's subtree into one combined statistics tree.

    'tree' is a node tree as produced by buildTree(): the children of the
    root are treated as home directories, and the home directories
    themselves are not reported.  Directories that sit at the same relative
    path under different homes are folded into a single entry holding:

        'count'    -- how many times that relative path was encountered
        'size'     -- the sum of the 'size' values of those directories
        'children' -- the same structure for the sub-directories

    Returns {'children': {...}}.  Raises KeyError if a visited directory
    node has no 'size' key.
    """
    def accumulate(name, node, parentStats):
        siblings = parentStats['children']
        entry = siblings.get(name)
        if entry is None:
            # First time this relative path is seen: start from zero.
            entry = {'count': 0, 'size': 0, 'children': {}}
            siblings[name] = entry
        entry['count'] = entry['count'] + 1
        entry['size'] = entry['size'] + node['size']
        for childName, childNode in node['children'].items():
            accumulate(childName, childNode, entry)

    dirStats = {'children': {}}
    for homedir in tree['children'].values():
        for topName, topNode in homedir['children'].items():
            accumulate(topName, topNode, dirStats)
    return dirStats

def displayDirTree(dirTree, depth=None, sortBy='size', sortOrder='descending'):
    """Print a statistics tree as an indented listing, one directory per line.

    Each line has the form "<indent><name> (<count>, <size>)", indented by
    four spaces per tree level.  Siblings are ordered by the node key named
    by sortBy, with ties broken by directory name.

    Parameters:
        dirTree   -- root node {'children': {...}}; every node below it must
                     carry 'count', 'size' and 'children' keys.
        depth     -- deepest level (top level is 0) whose children are still
                     expanded; None means no limit.
        sortBy    -- node key to sort siblings by ('size' or 'count').
        sortOrder -- 'ascending'; any other value sorts descending.
    """
    def sortDict(adict, sortBy='size', sortOrder='descending'):
        # Return (name, node) pairs ordered by node[sortBy], then by name.
        # Names are unique within a dict, so the nodes themselves are never
        # compared.
        return sorted(adict.items(),
                      key=lambda item: (item[1][sortBy], item[0]),
                      reverse=(sortOrder != 'ascending'))

    def recDDT(level, name, subtree):
        # A single parenthesised argument keeps this line valid both as a
        # Python 2 print statement and as a Python 3 print() call.
        print("%s%s (%s, %s)" % ("    " * level, name, subtree['count'],
                                 subtree['size']))
        # The node itself is always printed; the limit only stops descent.
        if depth is not None and level >= depth:
            return
        for dirName, data in sortDict(subtree['children'], sortBy, sortOrder):
            recDDT(level + 1, dirName, data)

    for dirName, data in sortDict(dirTree['children'], sortBy, sortOrder):
        recDDT(0, dirName, data)

def filterOutput(dirTree, mincount=1, minsize=0, count=20, size=1024):
    """Return a copy of a statistics tree with small entries lumped together.

    A directory is considered small when
        (its size < minsize and its count < mincount)
        or its size < size
        or its count < count.
    Small directories are not listed by name: their 'size' and 'count' are
    added to a sibling entry called '*others', and their sub-directories are
    not examined.  Every other directory is copied into the result and its
    children are filtered the same way.

    Parameters:
        dirTree  -- root node {'children': {...}}; every node below it must
                    carry 'count', 'size' and 'children' keys.
        mincount, minsize, count, size -- the thresholds described above.

    Returns a new tree; dirTree itself is not modified.
    """
    def isSmall(dirInfo):
        if dirInfo['size'] < minsize and dirInfo['count'] < mincount:
            return True
        return dirInfo['size'] < size or dirInfo['count'] < count

    def recFO(dirName, dirInfo, dirSubtree):
        siblings = dirSubtree['children']
        if isSmall(dirInfo):
            others = siblings.setdefault(
                '*others', {'count': 0, 'size': 0, 'children': {}})
            others['size'] += dirInfo['size']
            others['count'] += dirInfo['count']
            return
        # Large enough to be listed under its own name: copy the totals and
        # carry on with its children.
        kept = {'count': dirInfo['count'], 'size': dirInfo['size'],
                'children': {}}
        siblings[dirName] = kept
        for name, info in dirInfo['children'].items():
            recFO(name, info, kept)

    filteredTree = {'children': {}}
    for name, info in dirTree['children'].items():
        recFO(name, info, filteredTree)
    return filteredTree

# Run the whole pipeline: parse the 'du' output, merge the per-user trees,
# lump the small entries together and print the result.
duTree = buildTree(inFileName)
dirTree = walkHomes(duTree)
filteredTree = filterOutput(dirTree, mincount=mincount, minsize=minsize,
                            count=count, size=size)
# sortOrder is passed explicitly so that changing the constant at the top of
# the file actually changes the listing.
displayDirTree(filteredTree, depth=depth, sortBy=sortBy, sortOrder=sortOrder)