| #!/usr/bin/env python |
| |
| import datetime |
| import interactive # pylint: disable=relative-import |
| import itertools |
| import sys |
| import zipfile |
| |
| |
| def _batch(iterable, size=100): |
| """Returns an iterator for the given iterable object that yields iterators |
| to go through the specified sequence in batches. |
| |
| Example usage: |
| sequence = xrange(11) |
| for batchit in _batch(sequence, size=3): |
| for item in batchit: |
| print item, |
| print |
| |
| Will output: |
| 0 1 2 |
| 3 4 5 |
| 6 7 8 |
| 9 10 |
| |
| Args: |
| iterable: An iterable object: |
| size: An int which specifies size of a single batch. |
| |
| Returns: |
| An iterator which returns iterators that split the given collection |
| in batches. |
| """ |
| global_it = iter(iterable) |
| while True: |
| next_slice_it = itertools.islice(global_it, size) |
| yield itertools.chain([next_slice_it.next()], next_slice_it) |
| |
| |
def sanitize_html(fragment):
  """Runs an HTML fragment through the sanitizing parser.

  The fragment is parsed with html5lib's HTMLParser configured with the
  HTMLSanitizer tokenizer from melange.utils.htmlsanitizer. Each child node
  of the parsed fragment is then serialized with toxml() and the pieces are
  concatenated.

  Args:
    fragment: A string containing HTML fragment.

  Returns:
    A string containing sanitized HTML fragment.
  """
  # Imported lazily, as in the rest of this script.
  from html5lib import html5parser
  from melange.utils import htmlsanitizer

  parser = html5parser.HTMLParser(tokenizer=htmlsanitizer.HTMLSanitizer)
  parsed_fragment = parser.parseFragment(fragment)
  serialized_nodes = [node.toxml() for node in parsed_fragment.childNodes]
  return ''.join(serialized_nodes)
| |
def sanitizeProposals():
  """Dumps the stored and the sanitized content of every proposal to files.

  For each proposal returned by the query two files are written, using
  relative paths: '<name>.old' with the content as stored and '<name>.new'
  with the content after sanitization. <name> is the URL-safe form of the
  proposal key cut to at most 50 characters. No entity is saved back.

  NOTE(review): keys that share their first 50 characters map to the same
  file names, so later proposals overwrite the files of earlier ones.
  """
  from summerofcode.models import proposal as proposal_model

  it = interactive.deepFetchNDB(lambda: proposal_model.Proposal.query())
  for proposal in it:
    # Slicing is a no-op for names shorter than the limit.
    file_name = proposal.key.urlsafe()[:50]

    # Context managers guarantee the files are closed even when encoding or
    # sanitization fails half way through.
    with open('%s.old' % file_name, 'w') as old_file:
      old_file.write(proposal.content.encode('utf8'))

    with open('%s.new' % file_name, 'w') as new_file:
      new_file.write(sanitize_html(proposal.content).encode('utf8'))
| |
| |
def sampleProposals(key_strings):
  """Dumps the stored and the sanitized content of selected proposals.

  For each requested proposal two files are written, using relative paths:
  '<name>.old' with the content as stored and '<name>.new' with the content
  after sanitization. <name> is the URL-safe form of the proposal key cut to
  at most 50 characters. No entity is saved back.

  NOTE(review): ndb.get_multi is documented to return None for keys without
  a stored entity; such a key makes this function fail with AttributeError.
  Confirm whether skipping would be preferable.

  Args:
    key_strings: An iterable of URL-safe strings of proposal keys.
  """
  from google.appengine.ext import ndb
  from summerofcode.models import proposal as proposal_model

  proposals = ndb.get_multi(
      ndb.Key(urlsafe=key_string) for key_string in key_strings)
  for proposal in proposals:
    # Slicing is a no-op for names shorter than the limit.
    file_name = proposal.key.urlsafe()[:50]

    # Context managers guarantee the files are closed even when encoding or
    # sanitization fails half way through.
    with open('%s.old' % file_name, 'w') as old_file:
      old_file.write(proposal.content.encode('utf8'))

    with open('%s.new' % file_name, 'w') as new_file:
      new_file.write(sanitize_html(proposal.content).encode('utf8'))
| |
| |
def sanitizeOrganizationDescription(dry_run=True):
  """Replaces the description of every organization with a sanitized one.

  Before a description is replaced, its current value is stored as a text
  entry in a zip file whose name carries the current UTC timestamp.

  Args:
    dry_run: A bool specifying whether the function is run in a dry run mode
      or not. In a dry run the zip file is still written, but no
      organization is saved.
  """
  from melange.models import organization as org_model

  timestamp = datetime.datetime.utcnow().strftime('%Y-%m-%d-%H:%M')
  zipname = 'org-descriptions-%s.zip' % timestamp
  with zipfile.ZipFile(zipname, 'w') as archive:
    # pylint: disable=print-statement
    print('Existing content will be saved in %s file.' % zipname)
    # pylint: enable=print-statement
    organizations = interactive.deepFetchNDB(
        lambda: org_model.Organization.query())
    for org in organizations:
      # Keep a copy of the description as it is right now.
      entry_name = '%s.txt' % org.key.id().replace('/', '-')
      current_description = org.description.encode('utf-8')
      archive.writestr(entry_name, current_description, zipfile.ZIP_DEFLATED)

      org.description = sanitize_html(org.description)
      if dry_run:
        continue
      org.put()
| |
| |
def sanitizeProposalContent(dry_run=True):
  """Sanitizes content of the latest revision of each existing proposal.
  The old content is saved in a zip file.

  Only revisions whose content is changed by sanitization are archived and
  scheduled for saving. Proposals are processed in batches; the revisions of
  one batch are fetched and saved with a single multi-get / multi-put.

  Args:
    dry_run: A bool specifying whether the function is run in a dry run mode
      or not. In a dry run the zip file is still written, but no revision
      is saved.
  """
  from google.appengine.ext import ndb
  from summerofcode.models import proposal as proposal_model

  zipname = 'proposals-%s.zip' % (
      datetime.datetime.utcnow().strftime('%Y-%m-%d-%H:%M'))
  with zipfile.ZipFile(zipname, 'w') as zfile:
    # pylint: disable=print-statement
    print('Existing content will be saved in %s file.' % zipname)
    # pylint: enable=print-statement
    it = interactive.deepFetchNDB(lambda: proposal_model.Proposal.query())
    for batch_it in _batch(it):
      # Materialize the batch so that each revision can be matched with the
      # proposal it belongs to. Relying on the loop variable leaked from a
      # list comprehension would name every archive entry after the last
      # proposal of the batch.
      batch_proposals = list(batch_it)
      revisions = ndb.get_multi(
          [item.latest_revision for item in batch_proposals])

      to_put = []
      # get_multi returns results in the order of the requested keys.
      for proposal, revision in zip(batch_proposals, revisions):
        if revision is None:
          # No entity is stored under the key; nothing to sanitize.
          continue

        sanitized_content = sanitize_html(revision.content)
        if revision.content == sanitized_content:
          continue

        # add current revision content to a zip file
        zfile.writestr(
            '%s-%d-rev%s.txt' % (
                proposal.key.parent().id().replace('/', '-'),
                proposal.key.id(), revision.key.id()),
            revision.content.encode('utf-8'), zipfile.ZIP_DEFLATED)

        revision.content = sanitized_content
        to_put.append(revision)

      if not dry_run:
        ndb.put_multi(to_put)
| |
| |
def main():
  """Prepares the environment and starts the interactive session.

  Calls interactive.setup() and interactive.setDjango(), then passes the
  command line arguments (without the script name) together with a dict of
  short aliases for the functions of this script to interactive.remote(),
  and finally calls interactive.main().
  """
  interactive.setup()
  interactive.setDjango()

  # Short names under which the functions of this script are handed over.
  shell_context = {
      'sample': sampleProposals,
      'sod': sanitizeOrganizationDescription,
      'sp': sanitizeProposals,
      'spc': sanitizeProposalContent,
  }
  remote_args = sys.argv[1:]

  interactive.remote(remote_args, shell_context)
  interactive.main()
| |
if __name__ == '__main__':
  # The script needs at least the application id on the command line.
  # pylint: disable=print-statement
  if len(sys.argv) >= 2:
    print("Now you're at Python prompt, and you probably want to re-import "
          "this script and run functions from it")
    main()
  else:
    print("Usage: %s app_id [host]" % (sys.argv[0],))
    sys.exit(1)