# Parse a File

#!/usr/bin/env python
#
# [SNIPPET_NAME: Parse a File]
# [SNIPPET_CATEGORIES: os, Regular Expression]
# [SNIPPET_DESCRIPTION: Recursively go through all the python files and see who has submitted the most (assuming they put their name on them)]
# [SNIPPET_AUTHOR: Andy Breiner <[email protected]>]
# [SNIPPET_LICENSE: GPL]
# [SNIPPET_DOCS: http://docs.python.org/library/os.html, # http://docs.python.org/library/re.html]

import os
import re

# Module-level tallies filled in by find_python_files().  Both are keyed by
# the author's email address, taken from a line of the form
# SNIPPET_AUTHOR: USER_NAME <USER_EMAIL>]
# A file without such a line is simply not counted.

# email -> author name, recorded the first time that email is seen
name_list = dict()

# email -> number of python files carrying that email
count_list = dict()


def _parse_author(content):
    """Extract the author from the first SNIPPET_AUTHOR line of content.

    Returns a (name, email) tuple, or None when content has no
    "SNIPPET_AUTHOR:" line or that line has no "<email>" part.
    """
    # only the first occurrence is considered; "." does not match a
    # newline, so the match stops at the end of that line
    found = re.search(r'SNIPPET_AUTHOR:.+', content)
    if found is None:
        return None

    # the name is the text between the first ":" and the first "<", the
    # email is whatever sits inside that first < > pair.  Excluding "<"
    # and ">" from the groups keeps a second <...> on the same line from
    # being swallowed into the name or the email.
    parts = re.search(r':([^<]*)<([^<>]+)>', found.group(0))
    if parts is None:
        return None

    # strip() removes the spaces around the name
    return parts.group(1).strip(), parts.group(2)


def find_python_files(directory):
    """Recursively tally snippet authors for the python files in directory.

    Every file whose name ends with ".py" is read and searched for a
    SNIPPET_AUTHOR line.  For each author found, the module-level
    count_list[email] is incremented and, the first time an email is seen,
    name_list[email] is set to the author's name.  Sub-directories are
    searched recursively.  Files without a parsable author line, and files
    that cannot be read, are skipped.
    """

    # Loop over all the items in the directory
    for f in os.listdir(directory):
        path = os.path.join(directory, f)

        # If the item is a directory, recursively look through it
        if os.path.isdir(path):
            find_python_files(path)
            continue

        # Anything else is only interesting if it is a python file
        if not (os.path.isfile(path) and f.endswith(".py")):
            continue

        try:
            # "with" closes the file even if reading it fails
            with open(path, "r") as file_handler:
                content = file_handler.read()
        except (IOError, OSError, UnicodeError):
            # an unreadable file is not counted, same as a file without
            # an author line
            continue

        author = _parse_author(content)
        if author is None:
            continue
        name, email = author

        # remember the name only the first time this email is encountered
        if email not in count_list:
            name_list[email] = name
        count_list[email] = count_list.get(email, 0) + 1

# go up one level and start searching for python files
location = os.path.join(os.getcwd(), "../")

# Recurse through that location and parse all the python files to see who
# has contributed the most
find_python_files(location)

# turn the tallies into a list of (email, count) entries ordered from the
# highest count to the lowest; entries with the same count are ordered by
# email, also descending.  The key indexes into the entry instead of
# unpacking it in the lambda's parameter list, which Python 3 rejects.
count_list = sorted(count_list.items(),
                    key=lambda entry: (entry[1], entry[0]),
                    reverse=True)

# print each author's name followed by their python snippet count; the name
# is looked up in name_list using the email.  Parenthesised print with a
# single argument gives the same output on Python 2 and Python 3.
for email, count in count_list:
    print(name_list[email] + " " + str(count))