Paste #sk4bPE5P9QsKhmcYlsK0 at spacepaste

from collections import defaultdict
# Scan videotimeview4 in fixed-size batches and collect duplicate datapoints:
# rows that share the same user, lesson, video and truncated timestamp.
# Within each such group the entry with the smallest sceneid is kept and the
# remaining (sceneid, id) pairs are queued in datapointsToDelete.
BATCH_SIZE = 10000

# NOTE(review): presumably videoTimeView is the DataFrame behind the
# videotimeview4 table queried below -- confirm the two row counts agree.
numRows = videoTimeView.count()
rowStart = 0
rowEnd = BATCH_SIZE
while rowStart < numRows:
    # The window is (rowStart, rowEnd]: ROW_NUMBER() starts at 1, so an
    # exclusive lower bound and an inclusive upper bound cover every row
    # exactly once as both bounds advance by BATCH_SIZE.
    # `id` is the last sort key so that rows tied on the other columns (the
    # very duplicates being searched for) get a stable row number across the
    # separate per-batch queries.
    # Assumes videotimeview4 exposes `id` and `sceneid` columns; the loop
    # below reads both from every row -- TODO confirm against the schema.
    query = (
        "SELECT id, userid, lessonid, videoid, sceneid, createdat FROM "
        "(SELECT id, userid, lessonid, videoid, sceneid, createdat, "
        "ROW_NUMBER() OVER "
        "(ORDER BY userid, lessonid, videoid, createdat, id) AS rn "
        "FROM videotimeview4) q "
        "WHERE rn > %d AND rn <= %d "
        "ORDER BY userid, lessonid, videoid, createdat, id"
    ) % (rowStart, rowEnd)
    ulvdata = hiveContext.sql(query).collect()

    # datapoints[userid][lessonid][videoid][timestamp] -> [(sceneid, id), ...]
    datapoints = defaultdict(
        lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list))))
    for row in ulvdata:
        rowId = str(row['id'])
        userid = str(row['userid'])
        lessonid = str(row['lessonid'])
        videoid = str(row['videoid'])
        sceneid = row['sceneid']
        # Drop the last five characters of the timestamp text so rows are
        # grouped on the remaining prefix.
        # NOTE(review): presumably this strips a sub-second suffix -- confirm
        # against the actual createdat format.
        timestamp = str(row['createdat'])[:-5]
        datapoints[userid][lessonid][videoid][timestamp].append(
            (sceneid, rowId))

    # NOTE(review): a duplicate group that straddles a batch boundary is only
    # partially visible in each batch and may be missed or under-reported.
    datapointsToDelete = []
    for user, lessons in sorted(datapoints.items()):
        for lesson, videos in sorted(lessons.items()):
            for video, timestamps in sorted(videos.items()):
                for stamp, scenes in sorted(timestamps.items()):
                    if len(scenes) > 1:
                        # Keep the entry with the smallest sceneid; every
                        # other entry in the group is a duplicate.
                        sortedScenes = sorted(scenes, key=lambda tup: tup[0])
                        datapointsToDelete.extend(sortedScenes[1:])
                        # Same output as the Python 2 statement
                        # `print user, lesson, video, stamp, sortedScenes`,
                        # but also valid on Python 3.
                        print("%s %s %s %s %s" % (
                            user, lesson, video, stamp, sortedScenes))

    # Pseudocode for the steps that are not implemented yet:
    # TODO: delete datapointsToDelete from the other db
    # TODO: clean up memory before fetching the next batch

    # Advance the window to the next batch.
    rowStart = rowStart + BATCH_SIZE
    rowEnd = rowEnd + BATCH_SIZE

spacepaste