Skip to content

Commit

Permalink
add git find-reviewers tool
Browse files Browse the repository at this point in the history
this makes sense as an essential part of of the "git workflow", so it
behoves to move it into these tools and out of the webapp specific
version.
  • Loading branch information
Matthew Rothenberg authored and mroth committed May 26, 2015
1 parent 3de109d commit 7628617
Show file tree
Hide file tree
Showing 2 changed files with 289 additions and 4 deletions.
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
# khan academy git-workflow

Collection of scripts in order to enable the [git workflow][git-at-ka]
> Collection of scripts used to enable the [git workflow][git-at-ka]
at Khan Academy. (see also: [arcanist])

[git-at-ka]: https://khanacademy.org/r/git-at-ka
[arcanist]: https://github.com/khan/arcanist

## Tools

#### git deploy-branch
Creates a remote deploy branch for use with GitHub-style deploys.
Creates a remote _deploy branch_ for use with GitHub-style deploys.

For GitHub-style deploys, all work must branch off a deploy branch.

#### git review-branch
Creates a new local branch ensuring that it's not based off master.
Such a branch is called a 'review branch'.
Such a branch is called a _review branch_.

All review branches should be based off a deploy branch.

#### git recursive-grep
Runs git grep recursively through submodules, showing file paths
relative to cwd.

#### git find-reviewers
Find the best reviewer(s) for a given changeset. The idea is that if one user
has modified all the lines you are editing, they are a good candidate to review
your change.
281 changes: 281 additions & 0 deletions bin/git-find-reviewers
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
#!/usr/bin/env python

"""Find the best reviewer(s) for a given changeset.
This works under both git and mercurial.
1) Runs 'hg diff <argv>' (so by default, diffs all currently edited files).
or 'git diff <argv>'.
2) Analyzes the diff to find out exact lines that have changed.
3) Runs 'hg/git blame' to figure out who last modified those lines.
4) For each file, prints the usernames who last-modified any of the
diffed lines in the file, along with how many of the lines they
modified.
The idea is that if one user has modified all the lines you are
editing, they are a good candidate to review your change.
"""

import os
import re
import sys
import subprocess


# The line we care about in the diff output is
# @@ -<startline>,<numlines> ...
# or @@ -<startline> ... # in which <numlines> is taken to be 1
# (Everything else is header or diff content, which we ignore.)
_DIFFLINE_RE = re.compile('^@@ -(\d+)(?:,(\d+))? ')

_NEWFILE_RE = re.compile('^--- (.*)')


class Mercurial(object):
def __init__(self, ui, repo):
self.ui = ui
self.repo = repo

def write(self, msg):
self.ui.write(msg)

def find_wholefile_lines(self, files, revision='.'):
"""Return a map from abspath -> set-of-all-linumbers in the file."""
ctx = self.repo[revision] # state of repository base revision
m = ctx.match(files, None, None, 'relpath')
all_files = ctx.walk(m)

all_lines = {}
for abspath in all_files:
before_text = ctx.filectx(abspath).data()
num_lines = before_text.count('\n')
all_lines[abspath] = set(range(1, num_lines + 1))

return all_lines

def find_modified_lines(self, files, revision='.'):
"""Return a map from abspath -> set-of-linenumbers changed."""
import mercurial.mdiff

ctx = self.repo[revision] # state of repository base revision
edit_ctx = self.repo[None] # current working state ('.' + local edits)

# Find the files that have modifications.
m = ctx.match(files, None, None, 'relpath')
# Only count files that have been edited from tip.
modified = self.repo.status(ctx, None, match=m)[0]

modified_lines = {}
diffopts = mercurial.mdiff.diffopts(context=0, nodates=True)
for abspath in modified:
before_text = ctx.filectx(abspath).data()
after_text = edit_ctx.filectx(abspath).data()
diff_text = mercurial.mdiff.unidiff(
before_text, None, after_text, None,
abspath, abspath, opts=diffopts)
# Look at the '@@ -<startline>,<numlines> ...' diffline to
# find what lines in the input file were changed.
modified_lines.setdefault(abspath, set())
for line in diff_text.splitlines():
m = _DIFFLINE_RE.match(line)
if m:
startline, n = int(m.group(1)), int(m.group(2) or '1')
modified_lines[abspath].update(range(startline,
startline + n))

return modified_lines

def get_annotation_info(self, abspaths, revision='.'):
"""Return a map abspath -> list-of-author-names.
retval[filename][i] says who wrote the i-th line of the file.
Line numbers start at 1, so retval[filename][0] is always None.
"""
retval = {}
user_to_shortuser = {}

ctx = self.repo[revision] # state of repository base revision
for abspath in abspaths:
retval[abspath] = [None]
anno_lines = ctx[abspath].annotate(follow=True)
for anno_line in anno_lines:
modifier = anno_line[0].user()
if modifier not in user_to_shortuser:
user_to_shortuser[modifier] = self.ui.shortuser(modifier)
retval[abspath].append(user_to_shortuser[modifier])

return retval


class Git(object):
def write(self, msg):
sys.stdout.write(msg)

def find_wholefile_lines(self, files, revision='HEAD'):
"""Return a map from abspath -> set-of-all-linumbers in the file."""
all_lines = {}
for f in files:
before_text = subprocess.check_output(['git', 'show',
'%s:%s' % (revision, f)])
num_lines = before_text.count('\n')
all_lines[os.path.abspath(f)] = set(range(1, num_lines + 1))
return all_lines

def find_modified_lines(self, files, revision='HEAD'):
"""Return a map from abspath -> set-of-linenumbers changed."""
modified_lines = {}

# Only count Deleted and Modified files.
diff_output = subprocess.check_output(['git', 'diff', '-U0',
'--diff-filter=DM',
'--no-ext-diff', '--no-prefix',
revision, '--'] + files)
abspath = None
for line in diff_output.splitlines():
m = _NEWFILE_RE.match(line)
if m:
abspath = os.path.abspath(m.group(1))
modified_lines[abspath] = set()
else:
m = _DIFFLINE_RE.match(line)
if m:
assert abspath, line # filename comes before diff info
startline, n = int(m.group(1)), int(m.group(2) or '1')
modified_lines[abspath].update(range(startline,
startline + n))

return modified_lines

def get_annotation_info(self, abspaths, revision='HEAD'):
"""Return a map abspath -> list-of-author-nqames.
retval[filename][i] says who wrote the i-th line of the file.
Line numbers start at 1, so retval[filename][0] is always None.
"""
retval = {}
author_re = re.compile(r'author-mail <([^>]*)>')

for abspath in abspaths:
retval[abspath] = [None]
# TODO(csilvers): also specify -C?
blame_output = subprocess.check_output(['git', 'blame', '-M',
'--line-porcelain',
revision, '--', abspath])
for line in blame_output.splitlines():
m = author_re.match(line)
if m:
author = m.group(1)
# Just to make the common-case output prettier.
if author.endswith('@khanacademy.org'):
author = author[:-len('@khanacademy.org')]
retval[abspath].append(author)

return retval


def findreviewers(vcs, files, revision=None, num_reviewers=3,
whole_file=False, output_per_file=False, ignore=[]):
"""Find the best reviewer(s) for a given changeset.
Examines the current changes in this file, and runs 'hg blame' or
'git blame' (depending on 'vcs') to find who last edited those
same lines. Collates and returns this information, including how
many lines each person is responsible for.
Arguments:
vcs: either a Mercurial or a Git instance, from this file.
files: a list of filenames to find reviewer information for
revision: what revision to diff against when looking for
reviewers (typically '.' for mercurial or 'HEAD' for git).
num_reviewers: the number of reviewers to suggest for each file.
3 is a reasonable value.
whole_file: if True, return reviewer information for the input
files as a whole, not just for the diff vs 'revision'. This
is useful when you want to know who is 'most responsible' for
a file.
output_per_file: if True, instead of printing the best reviewers
for the set of input files as a whole, prints a separate list
of best reviewers for each file in the input.
ignore: a set/list of revisions to ignore when finding blame info.
TODO(csilvers): implement this.
"""
# revision has to be a kwarg because of how findreviewers() is
# called, but it's actually required.
assert revision, 'revision argument cannot be None!'

if whole_file:
modified_lines = vcs.find_wholefile_lines(files, revision)
else:
modified_lines = vcs.find_modified_lines(files, revision)

annotation_info = vcs.get_annotation_info(modified_lines.keys(), revision)

if output_per_file:
# filename -> {author: num_lines, ...}
num_lines_per_author = {abspath: {} for abspath in modified_lines}
else:
# None -> {author: num_lines, ...}
num_lines_per_author = {None: {}}

for abspath in modified_lines:
for linenum in modified_lines[abspath]:
author = annotation_info[abspath][linenum]
if output_per_file:
num_lines_per_author[abspath].setdefault(author, 0)
num_lines_per_author[abspath][author] += 1
else:
# Just store global info
num_lines_per_author[None].setdefault(author, 0)
num_lines_per_author[None][author] += 1

# Print the information out.
for abspath in sorted(num_lines_per_author.iterkeys()):
if abspath:
vcs.write('\n--- %s\n' % abspath)

reviewers = num_lines_per_author[abspath].items()
reviewers.sort(key=lambda (_, num_lines): num_lines, reverse=True)
total_lines = sum(num_lines_per_author[abspath].itervalues())

for (reviewer, reviewer_num_lines) in reviewers[:num_reviewers]:
vcs.write('%s: %s lines (%.1f%%)\n'
% (reviewer, reviewer_num_lines,
reviewer_num_lines * 100.0 / total_lines))


# How hg uses this script: via the cmdtable hook.
cmdtable = {
'findreviewers':
(lambda ui, repo, *files, **opts: (
findreviewers(Mercurial(ui, repo), files, **opts)),
[('f', 'output-per-file', None, 'Print results per input file'),
('n', 'num-reviewers', 3, 'How many reviewers to show'),
('w', 'whole-file', None, 'Calculate reviewers based on entire file'),
('i', 'ignore', [], 'TODO: Revisions to ignore when annotating'),
('r', 'revision', '.', 'Revision to use as base'),
],
'[-f] [-n #] [-w] [-i <commit_id> ...] [FILE...]')
}


# How git uses this script: via __main__
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(description=__doc__)

parser.add_argument('files', nargs='*')
for (shortname, longname, default, help) in cmdtable['findreviewers'][1]:
if default is None:
parser.add_argument('--%s' % longname, '-%s' % shortname,
help=help, default=default,
action='store_true')
else:
parser.add_argument('--%s' % longname, '-%s' % shortname,
help=help, default=default,
type=type(default))
# The one place we differ from the mercurial defaults.
parser.set_defaults(revision='HEAD')

args = parser.parse_args()

findreviewers(Git(), **args.__dict__)

0 comments on commit 7628617

Please sign in to comment.