spacepaste

  1.  
  2. from collections import defaultdict
  3. numRows = videoTimeView.count()
  4. rowStart = 0
  5. rowEnd = 10000
  6. while (rowStart < numRows)
  7. query = "SELECT userid, lessonid, videoid, createdat FROM (SELECT userid, lessonid, videoid, createdat, ROW_NUMBER() OVER (ORDER BY userid, lessonid, videoid, createdat) AS rn FROM videotimeview4) q WHERE rn > %d AND rn < %d ORDER BY userid, lessonid, videoid, createdat" % (rowStart, rowEnd)
  8. ulvdata = hiveContext.sql(query).collect()
  9. datapoints = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list))))
  10. for x in ulvdata:
  11. id = str(x['id'])
  12. userid = str(x['userid'])
  13. lessonid = str(x['lessonid'])
  14. videoid = str(x['videoid'])
  15. sceneid = x['sceneid']
  16. time = str(x['createdat'])[:-5]
  17. datapoints[userid][lessonid][videoid][time].append((sceneid, id))
  18. datapointsToDelete = []
  19. for u, lessons in sorted(datapoints.items()):
  20. for l,video in sorted(lessons.items()):
  21. for v,time in sorted(video.items()):
  22. for t,scenes in sorted(time.items()):
  23. if len(scenes) > 1:
  24. sortedScenes = sorted(scenes, key=lambda tup: tup[0])
  25. datapointsToDelete.extend(sortedScenes[1:])
  26. print u, l, v, t, sortedScenes
  27. # sudo code
  28. # delete datapoints from other db
  29. # cleanup memory
  30. # get next batch of data
  31. rowStart = rowStart + 10000
  32. rowEnd = rowEnd + 10000
  33.