#!/usr/bin/env Rscript
#
# Based on Breen's Example 2: airline
#
library(rmr2)
# assumes 'airline' and airline/data exists on HDFS under user's home directory
hdfs.data.root = 'airline'
hdfs.data = file.path(hdfs.data.root, 'data')
# unless otherwise specified, directories on HDFS should be relative to user's home
hdfs.out.root = hdfs.data.root
hdfs.out = file.path(hdfs.out.root, 'out')
mapper.year.market.enroute_time = function(k, fields) {
# Skip header line in csv formatted file
if (!(as.character(fields[[1]]) == "Year")) {
keyval(as.character(fields[[9]]), 1)
}
}
reducer.year.market.enroute_time = function(key, vv) {
# count values for each key
keyval(key, sum(as.numeric(vv),na.rm=TRUE))
}
mr.year.market.enroute_time = function (input, output) {
mapreduce(input = input,
output = output,
input.format = make.input.format("csv", sep = ","),
map = mapper.year.market.enroute_time,
reduce = reducer.year.market.enroute_time)
}
out = from.dfs(mr.year.market.enroute_time(hdfs.data, hdfs.out))
results.df = as.data.frame(out,stringsAsFactors=F )
colnames(results.df) = c('carrier', 'count')
print(results.df)
Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
2004,3,25,4,848,840,1241,1225,HA,1,N587HA,353,345,218,16,8,LAX,HNL,2556,4,11,0,,0,16,0,0,0,0
2004,3,25,4,1426,1425,2135,2140,HA,2,N592HA,309,315,402,-5,1,HNL,LAX,2556,10,17,0,,0,0,0,0,0,0
2004,3,25,4,1222,1220,1551,1605,HA,3,N583HA,329,345,192,-14,2,LAX,HNL,2556,4,13,0,,0,0,0,0,0,0
2004,3,25,4,2220,2225,524,525,HA,4,N583HA,304,300,400,-1,-5,HNL,LAX,2556,5,19,0,,0,0,0,0,0,0
2004,3,25,4,1016,1010,1431,1430,HA,7,N591HA,375,380,228,1,6,LAS,HNL,2762,3,24,0,,0,0,0,0,0,0
2004,3,25,4,2243,2250,617,615,HA,8,N584HA,334,325,434,2,-7,HNL,LAS,2762,7,13,0,,0,0,0,0,0,0
2004,3,25,4,1717,1725,2046,2110,HA,9,N584HA,329,345,196,-24,-8,LAX,HNL,2556,6,7,0,,0,0,0,0,0,0
Approximately half of the real rows of the file will be "missed" when processed by the script. There is no warning or error message.