update_copyright.py

   1 #!/usr/bin/python
   2 #
   3 # Copyright
   4
   5 """Automatically update copyright boilerplate.
   6
   7 This script is adapted from one written for `Bugs Everywhere`_.
   8
   9 .. _Bugs Everywhere: http://bugseverywhere.org/
  10 """
  11
  12 import difflib
  13 import email.utils
  14 import os
  15 import os.path
  16 import re
  17 import sys
  18 import time
  19
  20
# Project-wide settings.
#   'project': name interpolated into the license blurb and the
#              AUTHORS file header.
#   'vcs':     selects the history backend defined further down; the
#              recognized values are 'Git', 'Mercurial' and 'Bazaar'.
PROJECT_INFO = {
    'project': 'Cookbook',
    'vcs': 'Git',
    }

# License text appended below the generated copyright line(s).  It is
# %-formatted with PROJECT_INFO before use.
# Break "copyright" into "copy" and "right" to avoid matching the
# REGEXP.
COPY_RIGHT_TEXT="""
This file is part of %(project)s.

%(project)s is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation, either version 3 of the License, or (at your
option) any later version.

%(project)s is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with %(project)s.  If not, see <http://www.gnu.org/licenses/>.
""".strip()

# Temporary placeholder that stands in for an existing copyright blurb
# while a file is being rewritten; it is later replaced by the freshly
# generated blurb.
COPY_RIGHT_TAG='-xyz-COPY' + '-RIGHT-zyx-' # unlikely to occur in the wild :p

# Convert author names to canonical forms.
# ALIASES[<canonical name>] = <list of aliases>
# for example,
# ALIASES = {
#     'John Doe <jdoe@a.com>':
#         ['John Doe', 'jdoe', 'J. Doe <j@doe.net>'],
#     }
# Git-based projects are encouraged to use .mailmap instead of
# ALIASES.  See git-shortlog(1) for details.
ALIASES = {}

# List of paths that should not be scanned for copyright updates.
# Entries are compared as string prefixes of the file name, so they
# should be written the way the scanner spells paths ('./dir/').
# IGNORED_PATHS = ['./.git/']
IGNORED_PATHS = ['./.git/', './recipe/', './cookbook/static/',
                 './cookbook/template/']
# List of files that should not be scanned for copyright updates.
# Entries are compared against the base name of each file.
# IGNORED_FILES = ['COPYING']
IGNORED_FILES = ['COPYING']

# Work around missing author holes in the VCS history.
# AUTHOR_HACKS[<path tuple>] = <list of missing authors>
# for example, if John Doe contributed to module.py but wasn't listed
# in the VCS history of that file:
# AUTHOR_HACKS = {
#     ('path', 'to', 'module.py'):['John Doe'],
#     }
AUTHOR_HACKS = {}

# Work around missing year holes in the VCS history.
# YEAR_HACKS[<path tuple>] = <original year>
# for example, if module.py was published in 2008 but the VCS history
# only goes back to 2010:
# YEAR_HACKS = {
#     ('path', 'to', 'module.py'):2008,
#     }
YEAR_HACKS = {}
  83
  84 # Helpers for VCS-specific commands
  85
  86 def splitpath(path):
  87     """Recursively split a path into elements.
  88
  89     Examples
  90     --------
  91
  92     >>> splitpath(os.path.join('a', 'b', 'c'))
  93     ('a', 'b', 'c')
  94     >>> splitpath(os.path.join('.', 'a', 'b', 'c'))
  95     ('a', 'b', 'c')
  96     """
  97     path = os.path.normpath(path)
  98     elements = []
  99     while True:
 100         dirname,basename = os.path.split(path)
 101         elements.insert(0,basename)
 102         if dirname in ['', '.']:
 103             break
 104         path = dirname
 105     return tuple(elements)
 106
 107 # VCS-specific commands
 108
 109 if PROJECT_INFO['vcs'] == 'Git':
 110
 111     import subprocess
 112
 113     _MSWINDOWS = sys.platform == 'win32'
 114     _POSIX = not _MSWINDOWS
 115
 116     def invoke(args, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, expect=(0,)):
 117         """
 118         expect should be a tuple of allowed exit codes.
 119         """
 120         try :
 121             if _POSIX:
 122                 q = subprocess.Popen(args, stdin=subprocess.PIPE,
 123                                      stdout=stdout, stderr=stderr)
 124             else:
 125                 assert _MSWINDOWS == True, 'invalid platform'
 126                 # win32 don't have os.execvp() so run the command in a shell
 127                 q = subprocess.Popen(args, stdin=subprocess.PIPE,
 128                                      stdout=stdout, stderr=stderr, shell=True)
 129         except OSError, e:
 130             raise ValueError([args, e])
 131         stdout,stderr = q.communicate(input=stdin)
 132         status = q.wait()
 133         if status not in expect:
 134             raise ValueError([args, status, stdout, stderr])
 135         return status, stdout, stderr
 136
 137     def git_cmd(*args):
 138         status,stdout,stderr = invoke(['git'] + list(args))
 139         return stdout.rstrip('\n')
 140
 141     def original_year(filename, year_hacks=YEAR_HACKS):
 142         output = git_cmd('log', '--follow',
 143                          '--format=format:%ad',  # Author date
 144                          '--date=short',         # YYYY-MM-DD
 145                          filename)
 146         years = [int(line.split('-', 1)[0]) for line in output.splitlines()]
 147         if splitpath(filename) in year_hacks:
 148             years.append(year_hacks[splitpath(filename)])
 149         years.sort()
 150         return years[0]
 151
 152     def authors(filename, author_hacks=AUTHOR_HACKS):
 153         output = git_cmd('log', '--follow', '--format=format:%aN <%aE>',
 154                          filename)   # Author name <author email>
 155         ret = list(set(output.splitlines()))
 156         if splitpath(filename) in author_hacks:
 157             ret.extend(author_hacks[splitpath(filename)])
 158         return ret
 159
 160     def authors_list(author_hacks=AUTHOR_HACKS):
 161         output = git_cmd('log', '--format=format:%aN <%aE>')
 162         ret = list(set(output.splitlines()))
 163         for path,authors in author_hacks.items():
 164             ret.extend(authors)
 165         return ret
 166
 167     def is_versioned(filename):
 168         output = git_cmd('log', '--follow', filename)
 169         if len(output) == 0:
 170             return False
 171         return True
 172
 173 elif PROJECT_INFO['vcs'] == 'Mercurial':
 174
 175     import StringIO
 176     import mercurial
 177     import mercurial.dispatch
 178
 179     def mercurial_cmd(*args):
 180         cwd = os.getcwd()
 181         stdout = sys.stdout
 182         stderr = sys.stderr
 183         tmp_stdout = StringIO.StringIO()
 184         tmp_stderr = StringIO.StringIO()
 185         sys.stdout = tmp_stdout
 186         sys.stderr = tmp_stderr
 187         try:
 188             mercurial.dispatch.dispatch(list(args))
 189         finally:
 190             os.chdir(cwd)
 191             sys.stdout = stdout
 192             sys.stderr = stderr
 193         return (tmp_stdout.getvalue().rstrip('\n'),
 194                 tmp_stderr.getvalue().rstrip('\n'))
 195
 196     def original_year(filename, year_hacks=YEAR_HACKS):
 197         # shortdate filter: YEAR-MONTH-DAY
 198         output,error = mercurial_cmd('log', '--follow',
 199                                      '--template', '{date|shortdate}\n',
 200                                      filename)
 201         years = [int(line.split('-', 1)[0]) for line in output.splitlines()]
 202         if splitpath(filename) in year_hacks:
 203             years.append(year_hacks[splitpath(filename)])
 204         years.sort()
 205         return years[0]
 206
 207     def authors(filename, author_hacks=AUTHOR_HACKS):
 208         output,error = mercurial_cmd('log', '--follow',
 209                                      '--template', '{author}\n',
 210                                      filename)
 211         ret = list(set(output.splitlines()))
 212         if splitpath(filename) in author_hacks:
 213             ret.extend(author_hacks[splitpath(filename)])
 214         return ret
 215
 216     def authors_list(author_hacks=AUTHOR_HACKS):
 217         output,error = mercurial_cmd('log', '--template', '{author}\n')
 218         ret = list(set(output.splitlines()))
 219         for path,authors in author_hacks.items():
 220             ret.extend(authors)
 221         return ret
 222
 223     def is_versioned(filename):
 224         output,error = mercurial_cmd('log', '--follow', filename)
 225         if len(error) > 0:
 226             return False
 227         return True
 228
 229 elif PROJECT_INFO['vcs'] == 'Bazaar':
 230
 231     import StringIO
 232     import bzrlib
 233     import bzrlib.builtins
 234     import bzrlib.log
 235
 236     class LogFormatter (bzrlib.log.LogFormatter):
 237         supports_merge_revisions = True
 238         preferred_levels = 0
 239         supports_deta = False
 240         supports_tags = False
 241         supports_diff = False
 242
 243         def log_revision(self, revision):
 244             raise NotImplementedError
 245
 246     class YearLogFormatter (LogFormatter):
 247         def log_revision(self, revision):
 248             self.to_file.write(
 249                 time.strftime('%Y', time.gmtime(revision.rev.timestamp))
 250                 +'\n')
 251
 252     class AuthorLogFormatter (LogFormatter):
 253         def log_revision(self, revision):
 254             authors = revision.rev.get_apparent_authors()
 255             self.to_file.write('\n'.join(authors)+'\n')
 256
 257     def original_year(filename, year_hacks=YEAR_HACKS):
 258         cmd = bzrlib.builtins.cmd_log()
 259         cmd.outf = StringIO.StringIO()
 260         cmd.run(file_list=[filename], log_format=YearLogFormatter, levels=0)
 261         years = [int(year) for year in set(cmd.outf.getvalue().splitlines())]
 262         if splitpath(filename) in year_hacks:
 263             years.append(year_hacks[splitpath(filename)])
 264         years.sort()
 265         return years[0]
 266
 267     def authors(filename, author_hacks=AUTHOR_HACKS):
 268         cmd = bzrlib.builtins.cmd_log()
 269         cmd.outf = StringIO.StringIO()
 270         cmd.run(file_list=[filename], log_format=AuthorLogFormatter, levels=0)
 271         ret = list(set(cmd.outf.getvalue().splitlines()))
 272         if splitpath(filename) in author_hacks:
 273             ret.extend(author_hacks[splitpath(filename)])
 274         return ret
 275
 276     def authors_list(author_hacks=AUTHOR_HACKS):
 277         cmd = bzrlib.builtins.cmd_log()
 278         cmd.outf = StringIO.StringIO()
 279         cmd.run(log_format=AuthorLogFormatter, levels=0)
 280         output = cmd.outf.getvalue()
 281         ret = list(set(cmd.outf.getvalue().splitlines()))
 282         for path,authors in author_hacks.items():
 283             ret.extend(authors)
 284         return ret
 285
 286     def is_versioned(filename):
 287         cmd = bzrlib.builtins.cmd_log()
 288         cmd.outf = StringIO.StringIO()
 289         cmd.run(file_list=[filename])
 290         return True
 291
 292 else:
 293     raise NotImplementedError('Unrecognized VCS: %(vcs)s' % PROJECT_INFO)
 294
 295 # General utility commands
 296
 297 def _strip_email(*args):
 298     """Remove email addresses from a series of names.
 299
 300     Examples
 301     --------
 302
 303     >>> _strip_email('J Doe <jdoe@a.com>')
 304     ['J Doe']
 305     >>> _strip_email('J Doe <jdoe@a.com>', 'JJJ Smith <jjjs@a.com>')
 306     ['J Doe', 'JJJ Smith']
 307     """
 308     args = list(args)
 309     for i,arg in enumerate(args):
 310         if arg == None:
 311             continue
 312         author,addr = email.utils.parseaddr(arg)
 313         args[i] = author
 314     return args
 315
 316 def _reverse_aliases(aliases):
 317     """Reverse an `aliases` dict.
 318
 319     Input:   key: canonical name,  value: list of aliases
 320     Output:  key: alias,           value: canonical name
 321
 322     Examples
 323     --------
 324
 325     >>> aliases = {
 326     ...     'J Doe <jdoe@a.com>':['Johnny <jdoe@b.edu>', 'J'],
 327     ...     'JJJ Smith <jjjs@a.com>':['Jingly <jjjs@b.edu>'],
 328     ...     None:['Anonymous <a@a.com>'],
 329     ...     }
 330     >>> r = _reverse_aliases(aliases)
 331     >>> for item in sorted(r.items()):
 332     ...     print item
 333     ('Anonymous <a@a.com>', None)
 334     ('J', 'J Doe <jdoe@a.com>')
 335     ('Jingly <jjjs@b.edu>', 'JJJ Smith <jjjs@a.com>')
 336     ('Johnny <jdoe@b.edu>', 'J Doe <jdoe@a.com>')
 337     """
 338     output = {}
 339     for canonical_name,_aliases in aliases.items():
 340         for alias in _aliases:
 341             output[alias] = canonical_name
 342     return output
 343
 344 def _replace_aliases(authors, with_email=True, aliases=None):
 345     """Consolidate and sort `authors`.
 346
 347     Make the replacements listed in the `aliases` dict (key: canonical
 348     name, value: list of aliases).  If `aliases` is ``None``, default
 349     to ``ALIASES``.
 350
 351     >>> aliases = {
 352     ...     'J Doe <jdoe@a.com>':['Johnny <jdoe@b.edu>'],
 353     ...     'JJJ Smith <jjjs@a.com>':['Jingly <jjjs@b.edu>'],
 354     ...     None:['Anonymous <a@a.com>'],
 355     ...     }
 356     >>> _replace_aliases(['JJJ Smith <jjjs@a.com>', 'Johnny <jdoe@b.edu>',
 357     ...                   'Jingly <jjjs@b.edu>', 'Anonymous <a@a.com>'],
 358     ...                  with_email=True, aliases=aliases)
 359     ['J Doe <jdoe@a.com>', 'JJJ Smith <jjjs@a.com>']
 360     >>> _replace_aliases(['JJJ Smith', 'Johnny', 'Jingly', 'Anonymous'],
 361     ...                  with_email=False, aliases=aliases)
 362     ['J Doe', 'JJJ Smith']
 363     >>> _replace_aliases(['JJJ Smith <jjjs@a.com>', 'Johnny <jdoe@b.edu>',
 364     ...                   'Jingly <jjjs@b.edu>', 'J Doe <jdoe@a.com>'],
 365     ...                  with_email=True, aliases=aliases)
 366     ['J Doe <jdoe@a.com>', 'JJJ Smith <jjjs@a.com>']
 367     """
 368     if aliases == None:
 369         aliases = ALIASES
 370     if with_email == False:
 371         aliases = dict([(_strip_email(author)[0], _strip_email(*_aliases))
 372                         for author,_aliases in aliases.items()])
 373     rev_aliases = _reverse_aliases(aliases)
 374     for i,author in enumerate(authors):
 375         if author in rev_aliases:
 376             authors[i] = rev_aliases[author]
 377     authors = sorted(list(set(authors)))
 378     if None in authors:
 379         authors.remove(None)
 380     return authors
 381
 382 def _copyright_string(original_year, final_year, authors, prefix=''):
 383     """
 384     >>> print _copyright_string(original_year=2005,
 385     ...                         final_year=2005,
 386     ...                         authors=['A <a@a.com>', 'B <b@b.edu>'],
 387     ...                         prefix='# '
 388     ...                        ) # doctest: +ELLIPSIS
 389     # Copyright (C) 2005 A <a@a.com>
 390     #                    B <b@b.edu>
 391     #
 392     # This file...
 393     >>> print _copyright_string(original_year=2005,
 394     ...                         final_year=2009,
 395     ...                         authors=['A <a@a.com>', 'B <b@b.edu>']
 396     ...                        ) # doctest: +ELLIPSIS
 397     Copyright (C) 2005-2009 A <a@a.com>
 398                             B <b@b.edu>
 399     <BLANKLINE>
 400     This file...
 401     """
 402     if original_year == final_year:
 403         date_range = '%s' % original_year
 404     else:
 405         date_range = '%s-%s' % (original_year, final_year)
 406     lines = ['Copyright (C) %s %s' % (date_range, authors[0])]
 407     for author in authors[1:]:
 408         lines.append(' '*(len('Copyright (C) ')+len(date_range)+1) +
 409                      author)
 410     lines.append('')
 411     lines.extend((COPY_RIGHT_TEXT % PROJECT_INFO).splitlines())
 412     for i,line in enumerate(lines):
 413         lines[i] = (prefix + line).rstrip()
 414     return '\n'.join(lines)
 415
 416 def _tag_copyright(contents):
 417     """
 418     >>> contents = '''Some file
 419     ... bla bla
 420     ... # Copyright (copyright begins)
 421     ... # (copyright continues)
 422     ... # bla bla bla
 423     ... (copyright ends)
 424     ... bla bla bla
 425     ... '''
 426     >>> print _tag_copyright(contents).replace('COPY-RIGHT', 'CR')
 427     Some file
 428     bla bla
 429     -xyz-CR-zyx-
 430     (copyright ends)
 431     bla bla bla
 432     <BLANKLINE>
 433     """
 434     lines = []
 435     incopy = False
 436     for line in contents.splitlines():
 437         if incopy == False and line.startswith('# Copyright'):
 438             incopy = True
 439             lines.append(COPY_RIGHT_TAG)
 440         elif incopy == True and not line.startswith('#'):
 441             incopy = False
 442         if incopy == False:
 443             lines.append(line.rstrip('\n'))
 444     return '\n'.join(lines)+'\n'
 445
 446 def _update_copyright(contents, original_year, authors):
 447     """
 448     >>> contents = '''Some file
 449     ... bla bla
 450     ... # Copyright (copyright begins)
 451     ... # (copyright continues)
 452     ... # bla bla bla
 453     ... (copyright ends)
 454     ... bla bla bla
 455     ... '''
 456     >>> print _update_copyright(contents, 2008, ['Jack', 'Jill']
 457     ...     ) # doctest: +ELLIPSIS, +REPORT_UDIFF
 458     Some file
 459     bla bla
 460     # Copyright (C) 2008-... Jack
 461     #                         Jill
 462     #
 463     # This file...
 464     (copyright ends)
 465     bla bla bla
 466     <BLANKLINE>
 467     """
 468     current_year = time.gmtime()[0]
 469     copyright_string = _copyright_string(
 470         original_year, current_year, authors, prefix='# ')
 471     contents = _tag_copyright(contents)
 472     return contents.replace(COPY_RIGHT_TAG, copyright_string)
 473
 474 def ignored_file(filename, ignored_paths=None, ignored_files=None,
 475                  check_disk=True, check_vcs=True):
 476     """
 477     >>> ignored_paths = ['./a/', './b/']
 478     >>> ignored_files = ['x', 'y']
 479     >>> ignored_file('./a/z', ignored_paths, ignored_files, False, False)
 480     True
 481     >>> ignored_file('./ab/z', ignored_paths, ignored_files, False, False)
 482     False
 483     >>> ignored_file('./ab/x', ignored_paths, ignored_files, False, False)
 484     True
 485     >>> ignored_file('./ab/xy', ignored_paths, ignored_files, False, False)
 486     False
 487     >>> ignored_file('./z', ignored_paths, ignored_files, False, False)
 488     False
 489     """
 490     if ignored_paths == None:
 491         ignored_paths = IGNORED_PATHS
 492     if ignored_files == None:
 493         ignored_files = IGNORED_FILES
 494     if check_disk == True and os.path.isfile(filename) == False:
 495         return True
 496     for path in ignored_paths:
 497         if filename.startswith(path):
 498             return True
 499     if os.path.basename(filename) in ignored_files:
 500         return True
 501     if check_vcs == True and is_versioned(filename) == False:
 502         return True
 503     return False
 504
 505 def _set_contents(filename, contents, original_contents=None, dry_run=False,
 506                   verbose=0):
 507     if original_contents == None and os.path.isfile(filename):
 508         f = open(filename, 'r')
 509         original_contents = f.read()
 510         f.close()
 511     if verbose > 0:
 512         print "checking %s ... " % filename,
 513     if contents != original_contents:
 514         if verbose > 0:
 515             if original_contents == None:
 516                 print "[creating]"
 517             else:
 518                 print "[updating]"
 519         if verbose > 1 and original_contents != None:
 520             print '\n'.join(
 521                 difflib.unified_diff(
 522                     original_contents.splitlines(), contents.splitlines(),
 523                     fromfile=os.path.normpath(os.path.join('a', filename)),
 524                     tofile=os.path.normpath(os.path.join('b', filename)),
 525                     n=3, lineterm=''))
 526         if dry_run == False:
 527             f = file(filename, 'w')
 528             f.write(contents)
 529             f.close()
 530     elif verbose > 0:
 531         print "[no change]"
 532
 533 # Update commands
 534
 535 def update_authors(authors_fn=authors_list, dry_run=False, verbose=0):
 536     authors = authors_fn()
 537     authors = _replace_aliases(authors, with_email=True, aliases=ALIASES)
 538     new_contents = '%s was written by:\n%s\n' % (
 539         PROJECT_INFO['project'],
 540         '\n'.join(authors)
 541         )
 542     _set_contents('AUTHORS', new_contents, dry_run=dry_run, verbose=verbose)
 543
 544 def update_file(filename, original_year_fn=original_year, authors_fn=authors,
 545                 dry_run=False, verbose=0):
 546     f = file(filename, 'r')
 547     contents = f.read()
 548     f.close()
 549
 550     original_year = original_year_fn(filename)
 551     authors = authors_fn(filename)
 552     authors = _replace_aliases(authors, with_email=True, aliases=ALIASES)
 553
 554     new_contents = _update_copyright(contents, original_year, authors)
 555     _set_contents(filename, contents=new_contents, original_contents=contents,
 556                   dry_run=dry_run, verbose=verbose)
 557
 558 def update_files(files=None, dry_run=False, verbose=0):
 559     if files == None or len(files) == 0:
 560         files = []
 561         for dirpath,dirnames,filenames in os.walk('.'):
 562             for filename in filenames:
 563                 files.append(os.path.join(dirpath, filename))
 564
 565     for filename in files:
 566         if ignored_file(filename) == True:
 567             continue
 568         update_file(filename, dry_run=dry_run, verbose=verbose)
 569
 570 def test():
 571     import doctest
 572     doctest.testmod()
 573
 574 if __name__ == '__main__':
 575     import optparse
 576     import sys
 577
 578     usage = """%%prog [options] [file ...]
 579
 580 Update copyright information in source code with information from
 581 the %(vcs)s repository.  Run from the %(project)s repository root.
 582
 583 Replaces every line starting with '^# Copyright' and continuing with
 584 '^#' with an auto-generated copyright blurb.  If you want to add
 585 #-commented material after a copyright blurb, please insert a blank
 586 line between the blurb and your comment, so the next run of
 587 ``update_copyright.py`` doesn't clobber your comment.
 588
 589 If no files are given, a list of files to update is generated
 590 automatically.
 591 """ % PROJECT_INFO
 592     p = optparse.OptionParser(usage)
 593     p.add_option('--test', dest='test', default=False,
 594                  action='store_true', help='Run internal tests and exit')
 595     p.add_option('--dry-run', dest='dry_run', default=False,
 596                  action='store_true', help="Don't make any changes")
 597     p.add_option('-v', '--verbose', dest='verbose', default=0,
 598                  action='count', help='Increment verbosity')
 599     options,args = p.parse_args()
 600
 601     if options.test == True:
 602         test()
 603         sys.exit(0)
 604
 605     update_authors(dry_run=options.dry_run, verbose=options.verbose)
 606     update_files(files=args, dry_run=options.dry_run, verbose=options.verbose)