# blob: 755b923db77ed8bb8ed39f58478c1b52f47fccb3 [file] [log] [blame]
# Copyright 2010 the Melange authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Appengine Tasks related to GCI Task bulk create."""
import csv
import json
import logging
import StringIO
from HTMLParser import HTMLParseError
from html5lib import HTMLParser
from html5lib import sanitizer
from html5lib.html5parser import ParseError
from google.appengine.ext import db
from google.appengine.ext import ndb
from google.appengine.api import taskqueue
from google.appengine.runtime import DeadlineExceededError
from django import http
from django.conf.urls import url
from melange.logic import profile as profile_logic
from melange.models import profile as profile_model
from soc.tasks.helper import error_handler
from soc.tasks.helper.timekeeper import Timekeeper
from soc.modules.gci.logic.helper import notifications
from soc.modules.gci.models.bulk_create_data import GCIBulkCreateData
from soc.modules.gci.models import program as program_model
from soc.modules.gci.models import task as task_model
from soc.modules.gci.models.task import DifficultyLevel
from soc.modules.gci.models.task import GCITask
from soc.modules.gci.views import task_create as task_create_view
# URL handed to taskqueue.Task for the bulk create handler; it corresponds
# to the pattern registered in BulkCreateTask.djangoURLPatterns().
BULK_CREATE_URL = '/tasks/gci/task/bulk_create'

# Field names, in column order, used to read the uploaded CSV rows.
DATA_HEADERS = [
    'title',
    'description',
    'time_to_complete',
    'mentors',
    'task_type',
    'tags',
]

# Special tag used to specify that the created task is a beginner task.
# It sets the is_beginner flag instead of being stored as a regular tag.
BEGINNER_TASK_TAG = '@beginner'
class BulkCreateTask(object):
    """Request handlers for bulk creating GCITasks.

    The handlers work through the serialized task rows stored on a
    GCIBulkCreateData entity: every row that passes validation becomes a
    GCITask, every row that fails gets an error message recorded on the
    bulk data entity.
    """

    def djangoURLPatterns(self):
        """Returns the URL patterns for the tasks in this module.
        """
        patterns = [
            url(r'^tasks/gci/task/bulk_create$', self.bulkCreateTasks,
                name='gci_bulk_create_task'),
        ]
        return patterns

    def bulkCreateTasks(self, request, *args, **kwargs):
        """Creates GCI Tasks from the bulk data referenced in the POST dict.

        The POST dict should have the following information present:
          bulk_create_key: the key of the bulk_create entity

        Rows are handled one at a time until none are left or
        DeadlineExceededError is raised. When rows remain, a follow-up task
        for the same entity is added to the 'gci-update' queue. When none
        remain, the completion notification is sent and the bulk data
        entity is deleted.

        Args:
          request: Django request; only its POST data is read.
          *args: unused.
          **kwargs: unused.

        Returns:
          The result of error_handler.logErrorAndReturnOK() if the key is
          missing or does not resolve to an entity, otherwise an
          http.HttpResponse with body 'OK'.
        """
        # keep track of our own timelimit (20 seconds)
        timelimit = 20000
        timekeeper = Timekeeper(timelimit)

        post_dict = request.POST
        bulk_create_key = post_dict.get('bulk_create_key')
        if not bulk_create_key:
            return error_handler.logErrorAndReturnOK(
                'Not all POST data specified in: %s' % post_dict)

        bulk_data = GCIBulkCreateData.get(bulk_create_key)
        if not bulk_data:
            return error_handler.logErrorAndReturnOK(
                'No valid data found for key: %s' % bulk_create_key)

        # The uploading org admin and the organization are the same for
        # every row, so they are resolved once, before the loop. The stored
        # old-style db keys are converted to ndb keys to fetch the entities.
        org_admin = ndb.Key.from_old_key(
            GCIBulkCreateData.created_by.get_value_for_datastore(
                bulk_data)).get()
        org_key = ndb.Key.from_old_key(
            GCIBulkCreateData.org.get_value_for_datastore(bulk_data))
        org = org_key.get()

        # TODO(ljvderijk): Add transactions

        tasks = bulk_data.tasks
        while tasks:
            try:
                # check if we have time; presumably this raises
                # DeadlineExceededError once the budget is used up, which
                # is what ends the loop below -- TODO confirm in Timekeeper
                timekeeper.ping()

                # remove the first task
                task_as_string = tasks.pop(0)

                loaded_task = json.loads(task_as_string)
                task = {}
                for key, value in loaded_task.iteritems():
                    # If we don't do this python will complain about kwargs
                    # not being strings when we try to save the new task.
                    task[key.encode('UTF-8')] = value

                logging.info('Uncleaned task: %s', task)
                # clean the data
                errors = self._cleanTask(task, org)

                if errors:
                    logging.warning(
                        'Invalid task data uploaded, the following errors '
                        'occurred: %s', errors)
                    bulk_data.errors.append(db.Text(
                        'The task in row %i contains the following errors.\n'
                        ' %s'
                        % (bulk_data.tasksRemoved(), '\n'.join(errors))))

                # at-most-once semantics for creating tasks: the bulk data
                # (row already popped, errors recorded) is saved before the
                # GCITask itself is written
                bulk_data.put()

                if errors:
                    # do the next task
                    continue

                # set other properties
                task['org'] = org_key.to_old_key()

                # TODO(daniel): access program in more efficient way
                task['program'] = org_admin.program.to_old_key()
                task['status'] = task_model.UNPUBLISHED
                task['created_by'] = org_admin.key.to_old_key()
                task['modified_by'] = org_admin.key.to_old_key()

                # TODO(ljv): Remove difficulty level completely if needed.
                # Difficulty is hardcoded to easy since GCI2012 has no
                # difficulty.
                task['difficulty_level'] = DifficultyLevel.EASY

                # mentors and the org admin are subscribed, but only those
                # whose notification settings ask for task updates
                subscribers = task['mentor_entities'] + [org_admin]
                task['subscribers'] = list(set(
                    entity.key.to_old_key() for entity in subscribers
                    if entity.notification_settings.task_updates))

                # create the new task
                logging.info('Creating new task with fields: %s', task)
                task_entity = GCITask(**task)
                task_entity.put()
            except DeadlineExceededError:
                # time to bail out
                break

        if not tasks:
            # send out a message
            notifications.sendBulkCreationCompleted(bulk_data)
            bulk_data.delete()
        else:
            # there is still work to be done, do a non 500 response and
            # requeue
            task_params = {
                'bulk_create_key': bulk_data.key()
            }
            new_task = taskqueue.Task(params=task_params, url=BULK_CREATE_URL)
            # add to the gci queue
            new_task.add(queue_name='gci-update')

        # we're done here
        return http.HttpResponse('OK')

    @staticmethod
    def _splitUnique(value):
        """Splits a comma separated string into stripped, unique entries.

        Whitespace is stripped before duplicates are dropped, so 'a, a'
        yields a single entry. The order of first occurrence is kept. Blank
        entries are retained; callers decide how to treat them.

        Args:
          value: string with comma separated entries.

        Returns:
          A list with the distinct stripped entries.
        """
        seen = set()
        entries = []
        for entry in value.split(','):
            entry = entry.strip()
            if entry not in seen:
                seen.add(entry)
                entries.append(entry)
        return entries

    def _cleanTask(self, task, org):
        """Cleans the data given so that it can be safely stored as a task.

        The dictionary is updated in place: 'description' is sanitized,
        'time_to_complete' is converted to an int when it lies within the
        allowed range, 'mentors' is replaced by a list of profile keys,
        'tags' by a list of tags, and the entries 'mentor_entities', 'types'
        and (for beginner tasks only) 'is_beginner' are added.

        Args:
          task: Dictionary as constructed by the csv.DictReader().
          org: org_model.Organization entity for which the task is created.

        Returns:
          A list of error messages if any have occurred.
        """
        errors = []

        # check title
        if not task['title']:
            errors.append('No valid title present.')

        # clean description
        try:
            parser = HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
            parsed = parser.parseFragment(
                task['description'], encoding='utf-8')
            cleaned_string = ''.join(
                [tag.toxml() for tag in parsed.childNodes])
            task['description'] = cleaned_string.strip().replace(
                '\r\n', '\n')
        except (HTMLParseError, ParseError, TypeError) as e:
            logging.warning('Cleaning of description failed with: %s', e)
            errors.append(
                'Failed to clean the description, do not use naughty HTML '
                'such as <script>.')

        # clean time to complete; the value is given in hours while the
        # limits are defined in days
        try:
            hours_to_complete = int(task['time_to_complete'])

            min_task_time_to_complete_hours = (
                task_create_view.MIN_TASK_TIME_TO_COMPLETE_DAYS * 24)
            max_task_time_to_complete_hours = (
                task_create_view.MAX_TASK_TIME_TO_COMPLETE_DAYS * 24)

            if hours_to_complete < min_task_time_to_complete_hours:
                errors.append(
                    'Time to complete must be at least %d days (%d hours), '
                    'given was: %s'
                    % (task_create_view.MIN_TASK_TIME_TO_COMPLETE_DAYS,
                       min_task_time_to_complete_hours,
                       hours_to_complete))
            elif hours_to_complete > max_task_time_to_complete_hours:
                errors.append(
                    'Time to complete must be no more than %d days '
                    '(%d hours), given was: %s'
                    % (task_create_view.MAX_TASK_TIME_TO_COMPLETE_DAYS,
                       max_task_time_to_complete_hours,
                       hours_to_complete))
            else:
                task['time_to_complete'] = hours_to_complete
        except (ValueError, TypeError):
            errors.append('No valid time to completion found, given was: %s.'
                          % task['time_to_complete'])

        program_key = org.program.to_old_key()

        # clean mentors; a blank entry does not resolve to a profile and is
        # therefore reported like any other unknown mentor
        mentors = []
        mentor_entities = []
        for mentor_id in self._splitUnique(task['mentors']):
            mentor = profile_logic.getProfileForUsername(
                mentor_id, program_key)
            if mentor and mentor.status == profile_model.Status.ACTIVE:
                mentors.append(mentor.key.to_old_key())
                mentor_entities.append(mentor)
            else:
                errors.append('%s is not an active mentor.' % mentor_id)

        task['mentors'] = mentors
        task['mentor_entities'] = mentor_entities

        program_entity = program_model.GCIProgram.get(program_key)

        # clean task types; only types defined by the program are accepted
        types = []
        for task_type in self._splitUnique(task['task_type']):
            if task_type in program_entity.task_types:
                types.append(task_type)
            else:
                errors.append('%s is not a valid task type.' % task_type)
        task['types'] = types

        # clean task tags
        tags = []
        for tag in self._splitUnique(task['tags']):
            if not tag:
                # an empty column or a stray comma must not produce a tag
                continue
            if tag == BEGINNER_TASK_TAG:
                task['is_beginner'] = True
            else:
                tags.append(tag)
        task['tags'] = tags

        return errors
def spawnBulkCreateTasks(data, org_key, org_admin):
    """Stores the uploaded CSV data and spawns a task to bulk create from it.

    The data is read as CSV without a header row; the columns are mapped, in
    order, to the field names listed in DATA_HEADERS. Missing columns are
    filled with an empty string and surplus columns are dropped. Every row
    is serialized to JSON and kept on a new GCIBulkCreateData entity, whose
    key is then passed to a task for BULK_CREATE_URL.

    Fields where multiple values are allowed should be comma separated as
    well. These fields are task_type, tags and mentors.

    Args:
      data: string with data in csv format
      org_key: ndb.Key of the organization for which the task is created.
      org_admin: GCIProfile of the org admin uploading these tasks
    """
    csv_stream = StringIO.StringIO(data.encode('UTF-8'))
    reader = csv.DictReader(csv_stream, fieldnames=DATA_HEADERS, restval="")

    serialized_rows = []
    for row in reader:
        # DictReader collects surplus columns under the None key; drop them
        row.pop(None, None)
        serialized_rows.append(db.Text(json.dumps(row)))

    entity_properties = {
        'tasks': serialized_rows,
        'org': org_key.to_old_key(),
        'created_by': org_admin.key.to_old_key(),
        'total_tasks': len(serialized_rows),
    }
    bulk_data = GCIBulkCreateData(**entity_properties)
    bulk_data.put()

    task_params = {'bulk_create_key': bulk_data.key()}
    logging.info('Enqueued bulk_create with: %s', task_params)

    # no queue name is passed here, unlike the requeue in bulkCreateTasks
    taskqueue.Task(params=task_params, url=BULK_CREATE_URL).add()