
dplyrXdf's People

Contributors

hongooi73


dplyrXdf's Issues

Column selection functions don't work

> mtx <- rxDataStep(mtcars, "mtx.xdf")
> names(mtx)
 [1] "mpg"  "cyl"  "disp" "hp"   "drat" "wt"   "qsec" "vs"   "am"   "gear" "carb"
> select(mtx, starts_with("d"))
Error in eval(expr, envir, enclos) : 
  could not find function "starts_with"
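For reference, a possible workaround (my assumption, not from the report) is to expand the selection helper by hand against the column names in base R, then select those columns by name:

```r
# Expand the helper manually: columns whose names start with "d".
# Shown here on the mtcars data frame; the same names() call works on the
# xdf data source mtx, since names() is defined for xdf sources.
dvars <- grep("^d", names(mtcars), value = TRUE)
dvars
# [1] "disp" "drat"
```

The resulting character vector of names can then be passed to the selection directly, sidestepping the helper-function lookup that fails above.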

Exact class comparisons cause tbl_xdf inputs to fail in Spark CC

Specifically, this check: else if (class(data) %in% c("RxTextData", "RxXdfData", "RxParquetData", "RxOrcData")). Because it compares the class name exactly rather than using inherits(), a tbl_xdf input (which extends RxXdfData) never matches and falls through to the "not supported" branch.
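A minimal sketch of the underlying problem, using throwaway S4 classes (the real RxXdfData and tbl_xdf behave the same way, since tbl_xdf extends RxXdfData):

```r
# Demo classes standing in for RxXdfData and dplyrXdf's tbl_xdf
setClass("XdfDemo", representation(file = "character"))
setClass("tblXdfDemo", contains = "XdfDemo")

tbl <- new("tblXdfDemo", file = "x.xdf")

class(tbl) %in% c("XdfDemo")   # FALSE: class() gives only "tblXdfDemo"
inherits(tbl, "XdfDemo")       # TRUE:  checks the full inheritance chain
is(tbl, "XdfDemo")             # TRUE:  the S4 equivalent
```

Replacing the exact comparison with inherits() (or is()) would let subclasses of the supported data sources pass the check.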

Full function source:

> RevoScaleR:::rxCheckSupportForDataSource
function (computeContext, jobType, data, isOutData = FALSE) 
{
    if (is.null(data)) {
        return(TRUE)
    }
    if (isOutData) {
        inout.datasource <- "output data source"
    }
    else {
        inout.datasource <- "input data source"
    }
    if (class(computeContext) == "RxHadoopMR") {
        if (inherits(data, "RxSparkData")) {
            msg <- paste(class(data), "is supported in RxSpark compute context only")
            stop(msg, call. = FALSE)
        }
    }
    else if (class(computeContext) == "RxSpark") {
        if (class(data) == "character") {
            stop(paste(data, "is considered as local file. Data source used in RxSpark compute context must be in hdfs file system."), 
                call. = FALSE)
        }
        else if (class(data) %in% c("RxTextData", "RxXdfData", 
            "RxParquetData", "RxOrcData")) {
            if (data@fileSystem$fileSystemType != "hdfs") {
                if (grepl("://", data@file)) {
                  message(class(data), "specifying a hdfs fileSystem is recommended")
                }
                else {
                  msg <- paste(class(data), "as", inout.datasource, 
                    "in RxSpark compute context must be in hdfs file system")
                  stop(msg, call. = FALSE)
                }
            }
            if (is(data, "RxXdfData") && isTRUE(isOutData) && 
                tolower(rxFileExtension(data@file)) == "xdf") {
                msg <- paste(data@file, "has extension '.xdf', which is considered as single XDF and not supported in RxHadoopMR and RxSpark compute context")
                stop(msg, call. = FALSE)
            }
            if (is(data, "RxXdfData") && isTRUE(isOutData) && 
                is.logical(data@createCompositeSet) && data@createCompositeSet == 
                FALSE) {
                stop("The `createCompositeSet` argument cannot be set to FALSE in RxHadoopMR and RxSpark compute context.", 
                  call. = FALSE)
            }
            if (!is(data, "RxParquetData") && !is(data, "RxOrcData")) {
                cc.nameNode <- computeContext@nameNode
                data.hostName <- data@fileSystem$hostName
                if (grepl("://", cc.nameNode) || grepl("://", 
                  data.hostName)) {
                  if (cc.nameNode != data.hostName) {
                    msg <- paste(class(data), "data source and RxSpark compute context have different hdfs (default/azure blob/azure data lake). data source:", 
                      cc.nameNode, ", compute context:", data.hostName)
                    stop(msg, call. = FALSE)
                  }
                }
            }
        }
        else if (class(data) == "RxHiveData") {
            if (isOutData && data@dfType == "hive") {
                msg <- paste("Cannot use RxHiveData with query as", 
                  inout.datasource, ". Please use RxHiveData with table.")
                stop(msg, call. = FALSE)
            }
        }
        else {
            msg <- paste(class(data), "as", inout.datasource, 
                "in RxSpark compute context is not supported")
            stop(msg, call. = FALSE)
        }
        if (inherits(data, "RxSparkData") && identical(jobType, 
            "hpc")) {
            stop("RxSparkData is not supported in HPC (rxExec) mode", 
                call. = FALSE)
        }
    }
    else {
        if (inherits(data, "RxSparkData")) {
            msg <- paste(class(data), "is supported in RxSpark compute context only")
            stop(msg, call. = FALSE)
        }
        return(TRUE)
    }
    TRUE
}
<environment: namespace:RevoScaleR>

Persist Fails When Overwriting

I haven't thoroughly troubleshot this, but I ran into the error today.

I am trying to use mutate to create a new variable and then overwrite the file.

download.file('http://www.jaredlander.com/data/acs_ny.csv',
              destfile='data/acs_ny.csv')

library(dplyrXdf)

# assumed step, not shown in the original report: acs must be an xdf data source
acs <- rxImport('data/acs_ny.csv', 'acs_ny.xdf', overwrite=TRUE)

acs <- acs %>% 
    mutate(Income=FamilyIncome >= 120000) %>% 
    persist(outFile=acs, overwrite=TRUE)

Then I get this error:

Error in to[okay] : object of type 'S4' is not subsettable

rxDataStep cannot change type of existing variable

Even when writing to a new file:

> mtx <- rxDataStep(mtcars, "mtcars.xdf", overwrite=TRUE)

> z <- rxDataStep(mtx, "mtcarsz.xdf", transforms=list(am=as.character(am)), overwrite=TRUE)

> rxGetVarInfo(z)
Var 1: mpg, Type: numeric, Low/High: (10.4000, 33.9000)
Var 2: cyl, Type: numeric, Low/High: (4.0000, 8.0000)
Var 3: disp, Type: numeric, Low/High: (71.1000, 472.0000)
Var 4: hp, Type: numeric, Low/High: (52.0000, 335.0000)
Var 5: drat, Type: numeric, Low/High: (2.7600, 4.9300)
Var 6: wt, Type: numeric, Low/High: (1.5130, 5.4240)
Var 7: qsec, Type: numeric, Low/High: (14.5000, 22.9000)
Var 8: vs, Type: numeric, Low/High: (0.0000, 1.0000)
Var 9: am, Type: numeric, Low/High: (0.0000, 1.0000)  <--- wrong
Var 10: gear, Type: numeric, Low/High: (3.0000, 5.0000)
Var 11: carb, Type: numeric, Low/High: (1.0000, 8.0000)

Installation Issue

Hi there! I seem to be having issues installing the dplyrXdf package into RRE. I tried installing the package locally and from GitHub directly and I get the same issue:

devtools::install_github("RevolutionAnalytics/dplyrXdf")
Downloading GitHub repo RevolutionAnalytics/dplyrXdf@master
from URL https://api.github.com/repos/RevolutionAnalytics/dplyrXdf/zipball/master
Installing dplyrXdf
Skipping 2 unavailable packages: RevoPemaR, RevoScaleR
"C:/PROGRA1/RRO/R-311.3/bin/x64/R" --no-site-file --no-environ --no-save
--no-restore --quiet CMD INSTALL
"C:/Users/ionnoan/AppData/Local/Temp/13/RtmpAHjaHH/devtoolsdec5c752420/RevolutionAnalytics-dplyrXdf-1f70248"
--library="\OEBFS05/Home/IonnoAn/R/win-library/3.1" --install-tests

  • installing source package 'dplyrXdf' ...
    ** R
    ** inst
    ** preparing package for lazy loading
    ** help
    *** installing help indices
    converting help for package 'dplyrXdf'
    finding HTML links ... done
    as.data.frame html
    deleteXdfTbls html
    distinct html
    do html
    dplyrXdf html
    finding level-2 HTML links ... done

    factorise html
    group_by html
    groups html
    mutate html
    persist html
    rxArgs html
    setops html
    summarise html
    tbl html
    tbl_vars html
    ** building package indices
    ** installing vignettes
    ** testing if installed package can be loaded
    Warning in library(pkg_name, lib.loc = lib, character.only = TRUE, logical.return = TRUE) :
    there is no package called 'dplyrXdf'
    Error: loading failed
    Execution halted
    ERROR: loading failed

  • removing '\OEBFS05/Home/IonnoAn/R/win-library/3.1/dplyrXdf'
Error: Command failed (1)

rxDataStep with data frame input and HDFS output in remote Spark CC hangs R

hd <- RxHdfsFileSystem()
mthc <- RxXdfData("/user/sshuser/mtcarsc", fileSystem=hd, createCompositeSet=TRUE)
rxDataStep(mtcars, mthc)
#Warning: System error: Attempting to load HDFS functions in unsupported environment. (Failed to load C:/PROGRA~1/MIE74D~1/RSERVE~1/R_SERVER\library\RevoScaleR\rxLibs\x64\hdfs.dll)
#Error in try(rxCall("RxHdfsConnect", params)) : 
  #System error: Attempting to load HDFS functions in unsupported environment. (Failed to load C:/PROGRA~1/MIE74D~1/RSERVE~1/R_SERVER\library\RevoScaleR\rxLibs\x64\hdfs.dll)
#In addition: 
#Warning message:
#The input is R dataframe. Run the job using local compute context instead. In order to parallelize execution in Spark, please convert it to composite XDF format first. 

rxFactors output uses wrong fileSystem for data in HDFS

> hd <- RxHdfsFileSystem()
> mthc <- RxXdfData("/user/sshuser/mtcarsc", fileSystem=hd, createCompositeSet=TRUE)
> rxFactors(mthc, factorInfo="cyl", outFile=RxXdfData("/user/sshuser/factout", fileSystem=hd))
RxXdfData Source
"/user/sshuser/factout"
fileSystem: 
    fileSystemType: native   <--- wrong, should be hdfs

rxExec execObjects fails to export objects starting with "."

This affects compute contexts other than local.

This works on all CCs:

f <- function(x)
{
    x + y
}

g <- function(x, y)
{
    lst <- list(y=y)
    rxExec(f, rxElemArg(x), execObjects=list2env(lst))
}

g(1:4, 1)

This fails with localpar and dopar CCs:

f2 <- function(x)
{
    x + .y  # object name starting with .
}


g2 <- function(x, y)
{
    lst <- list(.y=y)  # object name starting with .
    rxExec(f2, rxElemArg(x), execObjects=list2env(lst))
}

g2(1:4, 1)
#Error in do.call(.rxDoParFUN, as.list(args)) : 
  #task 1 failed - "object '.y' not found"

Allow selection of output format

Choose whether to output to data frame, xdf, or xdf tbl:

mutate(xdf, y=x1 + x2)                        # default: xdf tbl           
mutate(xdf, y=x1 + x2, .output=NULL)          # data frame
mutate(xdf, y=x1 + x2, .output="output.xdf")  # persistent xdf

Change working directory location

I’ve been keeping tabs on the dplyrXdf package you’ve been publishing, and I think it’s awesome. I saw your recent blog post about some updates, and I’m really glad it’s being actively enhanced. However, there is one thing that is preventing us from leveraging it, and I wanted to get your thoughts on it.

When I tried it out a couple of months ago, it appeared to create temporary xdf files in the /tmp directory by default. I’ve worked for several financial services companies, and at all of them, /tmp has never been a Linux file system that we could expand much. Also, filling it up can cause some bad things to happen, depending on what you’re doing.

Do you happen to know if there are any plans to allow an end user to set the temp/working directory the package uses to something other than /tmp? I think dplyrXdf is a key package for unlocking ScaleR adoption, and a change to repoint it to a directory where we can get more space would help us get a tremendous amount of value out of it.
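As a possible stopgap (this is an assumption about the implementation, not a documented dplyrXdf setting): if the package derives its working directory from R's tempdir(), the standard TMPDIR environment variable can redirect it, since tempdir() is fixed at session startup from TMPDIR/TMP/TEMP:

```r
# Hypothetical workaround, assuming the package's temp files follow tempdir().
# Launch R with TMPDIR pointing at a roomier filesystem, e.g. from the shell:
#   TMPDIR=/data/rtemp R
# then confirm inside the session that tempdir() picked it up:
Sys.getenv("TMPDIR")
tempdir()
```

This would not help if the package hard-codes its own location, in which case a user-settable option as requested above seems the right fix.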

rxFactors in Spark CC requires explicitly specifying the factor levels

This fails:

> out <- RxXdfData("/user/sshuser/factout", fileSystem=hd)
> rxFactors(mthc, factorInfo="cyl", outFile=out)
> rxGetVarInfo(out)
Var 1: mpg, Type: numeric, Low/High: (10.4000, 33.9000)
Var 2: cyl
       0 factor levels: 

This works:

> rxFactors(mthc, factorInfo=list(cyl=list(levels=c("4", "6", "8"))), outFile=out, overwrite=TRUE)
Var 1: mpg, Type: numeric, Low/High: (10.4000, 33.9000)
Var 2: cyl
       3 factor levels: 4 6 8

doXdf issue with 0.7

When using doXdf with version 0.7, I'm getting the following error:

flightsMods <- flightsXdf %>%
     group_by(carrier) %>%
     doXdf(model=rxLinMod(arr_delay ~ dep_delay + hour, data=.))
Error in rxGetFileSystem(.data) : object 'x' not found

Can you see if anything broke in the last version when RxLocalParallel support was added?

Thanks.

Row appending with rxMerge very slow

rxMerge scales quadratically with the total number of rows when doing an rbind operation:

mergeBind <- function(lst)
{
    out <- RxXdfData(tempfile(fileext=".xdf"))
    system.time(
    {
        rxMerge(lst, outFile=out, type="union", overwrite=TRUE)
    })
}

rxOptions(reportProgress=0)
nyc <- RxXdfData("nyctaxi_sample.xdf") # NYC taxi data as used in the DSVM

mergeBind(list(nyc, nyc))
#user  system elapsed 
#0.00    0.00   27.65 

mergeBind(list(nyc, nyc, nyc))
#user system elapsed
#0.04   0.03   68.37

mergeBind(list(nyc, nyc, nyc, nyc))
#user system elapsed
#0.08   0.00  120.94

rxDataStep by contrast is linear:

## rxDataStep loop
loopBind <- function(lst)
{
    out <- RxXdfData(tempfile(fileext=".xdf"))
    system.time(
    {
        for(xdf in lst)
            rxDataStep(xdf, out, append=if(file.exists(out@file)) "rows" else "none")
    })
}

loopBind(list(nyc, nyc))
#user  system elapsed 
#0.56    0.01   13.69 

loopBind(list(nyc, nyc, nyc))
#user  system elapsed 
#0.16    0.02   20.36 

loopBind(list(nyc, nyc, nyc, nyc))
#user  system elapsed 
#0.23    0.01   27.25 

rxExecBy in Spark ignores varsToKeep in xdf data source

In local compute context, with data on native filesystem:

rxDataStep(mtcars, "mtcars.xdf", overwrite=TRUE)
mtx <- RxXdfData("mtcars.xdf", varsToKeep=c("cyl", "gear"))
rxExecBy(mtx, "cyl", function(keys, data) names(data))
#$P1
#$P1$keys
#$P1$keys[[1]]
#[1] 4

#$P1$result
#[1] "cyl" "gear"   <--- OK

#$P1$status
#[1] "OK"
#...

In local CC, with data in HDFS:

hd <- RxHdfsFileSystem()
mthc <- dplyrXdf::copy_to_hdfs(mtcars)
mthc2 <- RxXdfData(mthc@file, fileSystem=hd, createCompositeSet=TRUE, varsToKeep=c("cyl", "gear"))
rxExecBy(mthc2, "cyl", function(keys, data) names(data))
# same result as before

In Spark CC, with data in HDFS:

rxSetComputeContext(RxSpark())
rxExecBy(mthc2, "cyl", function(keys, data) names(data))
#[[1]]
#[[1]]$keys
#[[1]]$keys[[1]]
#[1] 4

#[[1]]$result
 #[1] "mpg"         "cyl"         "disp"        "hp"          "drat"  <--- all columns read in
 #[6] "wt"          "qsec"        "vs"          "am"          "gear"
#[11] "carb"        ".rxRowNames"

do() gives errors with multiple named args

In local CC:

> do(mtx, m=lm(mpg ~ ., data=.))
# A tibble: 1 x 1
         m
    <list>
1 <S3: lm>

> do(mtx, m=lm(mpg ~ ., data=.), w=lm(wt ~ ., data=.))
Error in is.data.frame(data) : object '.' not found

In Spark CC:

> do(mthc, m=lm(mpg ~ ., data=.))
# A tibble: 1 x 1
         m
    <list>
1 <S3: lm>

> mthc %>% group_by(cyl) %>% do(m=lm(mpg ~ ., data=.))
Source: local data frame [3 x 2]
Groups: <by row>

# A tibble: 3 x 2
    cyl        m
* <dbl>   <list>
1     6 <S3: lm>
2     4 <S3: lm>
3     8 <S3: lm>

> do(mthc, m=lm(mpg ~ ., data=.), w=lm(wt ~ ., data=.))
Error in is.data.frame(data) : object '.' not found

> mthc %>% group_by(cyl) %>% do(m=lm(mpg ~ ., data=.), w=lm(wt ~ ., data=.))
[[1]]
[[1]]$keys
[[1]]$keys[[1]]
[1] 4


[[1]]$result
NULL

[[1]]$status
[[1]]$status[[1]]
[1] "Error"

[[1]]$status[[2]]
[1] "unable to find required package 'dplyrXdf'"

[[1]]$status[[3]]
[1] "there is no package called 'dplyrXdf'"

rxFactors with data in HDFS cannot output to data frame

Whether in Spark or local CC (on edge node).

> mthc <- dplyrXdf::copy_to_hdfs(mtcars, "/user/sshuser/mtcarsc")
> rxFactors(mthc, list(cyl=list(levels=c(4, 6, 8))), outFile=NULL)

The file or directory /user/sshuser/mtcarsc cannot be found.
Error in doTryCatch(return(expr), name, parentenv, handler) :
  The file or directory /user/sshuser/mtcarsc cannot be found.

rxSummary transforms(n=1) fails if call has no references to existing vars

These fail:

mtx <- rxDataStep(mtcars, "mtcars.xdf", overwrite=TRUE)

rxSummary(~n, mtx, transforms=list(n=1L))
rxSummary(~n, mtx, transforms=list(n=rep(1L, .rxNumRows)))

These work:

rxSummary(~n, mtx, transforms=list(n=1L, m=mpg))  # throwaway variable
rxSummary(~n, mtx, transforms=list(n=rep(1L, length(mpg))))
rxSummary(~n:F(am), mtx, transforms=list(n=1L))

Problem installing the library

Hi,

I'm an RRE customer and I'm trying to install the library from GitHub using

devtools::install_github("RevolutionAnalytics/dplyrXdf")

devtools::install_github("RevolutionAnalytics/dplyrXdf")
Downloading GitHub repo RevolutionAnalytics/dplyrXdf@master
Error in curl::curl_fetch_memory(url, handle = handle) :
Timeout was reached

thank you in advance

Renaming fails on composite Xdf if rownames present

# explicitly create a composite Xdf (directory)
mtc <- RxXdfData("mtcarsc", createCompositeSet=TRUE)
rxDataStep(mtcars, mtc, overwrite=TRUE)

names(mtc)[1] <- "mpg2"
# In 'UpdateMetaInfo()' the two headers have different numbers of variables: 12 vs 11.
# Error in doTryCatch(return(expr), name, parentenv, handler) : 
#   In 'UpdateMetaInfo()' the two headers have different numbers of variables: 12 vs 11.

Joining on multiple unmatched factor variables fails

df1 <- data.frame(A=c("a", "b", "c", "d", "e"), B=c("k", "l", "m", "n", "o"), C=c(1, 2, 3, 4, 5))
df2 <- data.frame(E=c("a", "b", "c", "r", "w"), F=c("z", "l", "m", "n", "o"), G=c(6, 7, 8, 9, 10))
df1_rx <- rxImport(inData=df1, outFile="misc/df1.xdf", overwrite=TRUE)
df2_rx <- rxImport(inData=df2, outFile="misc/df2.xdf", overwrite=TRUE)

inner_join(df1_rx, df2_rx, by=c("A"="E", "B"="F"))
#Error in doTryCatch(return(expr), name, parentenv, handler) : 
#  attempt to set an attribute on NULL

Duplicate columns in varsToKeep throws error

mtx <- rxDataStep(mtcars, "mtcars.xdf", overwrite=TRUE)

rxDataStep(mtx, varsToKeep=c("mpg", "cyl", "cyl"))

#Failed to allocate 1804812534147383619 bytes.
#Caught exception in file: CxDataFile.cpp, line: 3190. ThreadID: 27540 Rethrowing.
#Caught exception in file: CxAnalysis.cpp, line: 6666. ThreadID: 27540 Rethrowing.
#Caught exception in file: CxAnalysis.cpp, line: 5837. ThreadID: 27540 Rethrowing.
#Caught exception in file: CxAnalysis.cpp, line: 5386. ThreadID: 27540 Rethrowing.
#Error in doTryCatch(return(expr), name, parentenv, handler) : 
#  bad allocation

rxFactors fails with relative paths in Spark CC

## set compute context to RxSpark
hd <- RxHdfsFileSystem()
mth <- dplyrXdf::copy_to_hdfs(mtcars)
out <- RxXdfData("ftest", fileSystem=hd, createCompositeSet=TRUE)
rxFactors(mth, outFile=out, factorInfo="am")

#Job failed, the last 20 lines of the log are shown below:
    #Internal Error: Fail to execute Hadoop/Spark job. 
    #Caught exception in file: /builddir/ExaRoot/ExaCore/CxAnalysis.cpp, line: 5837. ThreadID: 233486464 Rethrowing. 
    #Error in try({ :  
    #Traceback (most recent call last): 
      #File "/usr/lib64/microsoft-r/3.3/lib64/R/library/RevoScaleR/pythonScripts/common//logScaleR.py", line 1, in <module> 
        #from hdinsight_common import hdinsightlogging 
      #File "/usr/local/lib/python2.7/dist-packages/hdinsight_common/hdinsightlogging.py", line 8, in <module> 
        #import utilities 
      #File "/usr/local/lib/python2.7/dist-packages/hdinsight_common/utilities.py", line 10, in <module> 
        #import Constants 
      #File "/usr/local/lib/python2.7/dist-packages/hdinsight_common/Constants.py", line 3, in <module> 
        #from hdinsight_role_env import RoleEnv 
      #File "/usr/lib/python2.7/dist-packages/hdinsight_role_env/RoleEnv.py", line 3, in <module> 
        #from watchdog.observers import Observer   
    #ImportError: No module named watchdog.observers 
    #Error:  Error in try({ :  
      
    #======  ed1-hdi21 (Master HPA Process) has completed run at Tue Jul 11 20:32:36 2017  ====== 
    #>  
    #>  
#For the complete log, please refer to the log file C:\Users\hongooi\AppData\Local\Temp\MRSArchive\MRSLog-e6831631a5.log 

#Error in rxuHandleClusterJobTryFailure(retObject, hpcServerJob, autoCleanup) : 
  #Error completing job on cluster:
#Error in try({ : 

Whereas this works:

out <- RxXdfData("/user/sshuser/ftest", fileSystem=hd, createCompositeSet=TRUE)
rxFactors(mth, outFile=out, factorInfo="am")

rxExecBy in HDFS hangs when input file is in /tmp dir

# make a copy of mtcars
hd <- RxHdfsFileSystem()
mthc <- copy_to(hd, mtcars)

# works
out <- RxXdfData("/user/sshuser/out", fileSystem=hd, createCompositeSet=TRUE)
rxDataStep(mthc, out, varsToKeep="cyl")
rxExecBy(out, "cyl", function(keys, data) { })

# works
out2 <- RxXdfData("/user/sshuser/out2", fileSystem=hd, createCompositeSet=TRUE)
rxDataStep(mthc, out2, varsToKeep=c("cyl", "gear"))
rxExecBy(out2, "cyl", function(keys, data) { })
rxExecBy(out2, c("cyl", "gear"), function(keys, data) { })

# doesn't work - composite xdf location is "/tmp/dxTmp885c184c2ad8/file885c26d37544"
out3 <- as(tbl_xdf(mthc), "RxXdfData")
rxDataStep(mthc, out3, varsToKeep="cyl")
rxExecBy(out3, "cyl", function(keys, data) { })

rxExecBy in local CC gives bad result if varsToKeep specified

mt1 <- RxXdfData("misc/mtcars.xdf", varsToKeep="cyl")
rxExecBy(mt1, "cyl", function(keys, data) head(data))

#$P1
#$P1$keys
#$P1$keys[[1]]
#$P1$keys[[1]][[1]]
#RxXdfData Source
#"d:\misc\Rtemp\Rtmpy25tZ1\tmpPartitions8a8855615e7a\data\tmpPartitions8a8855615e7a_5006475794136178589.xdf"
#fileSystem: 
    #fileSystemType: native


#$P1$keys[[2]]
#data frame with 0 columns and 1 row


#$P1$result
  #cyl
#1   6
#2   6
#3   4
#4   6
#5   8
#6   6

#$P1$status
#[1] "OK"

Group by and mutate with .rxArgs does not work in parallel

I'm trying to do a group_by and then mutate with a custom function. It works in the local compute context but not in a parallel one.

Below is a reproducible example using the diamonds data.

Tested this on multiple machines.

Any insights?

# rxSetComputeContext('RxLocalParallel')
rxSetComputeContext('RxLocalSeq')
rxOptions('numCoresToUse'=-1)

library(dplyrXdf)
data(diamonds, package='ggplot2')

dia <- RxXdfData('diamonds.xdf')
rxDataStep(diamonds, outFile=dia, overwrite=TRUE, rowsPerRead=15000)

makeBucketDF <- function(dataList)
{
    getBucketCuts <- function(x, probs=c(.25, .50, .75))
    {
        c(min(x, na.rm=TRUE),
          mean(x, na.rm=TRUE) + qnorm(p=probs, 0, 1)*sd(x, na.rm=TRUE),
          max(x, na.rm=TRUE))
    }

    makeBuckets <- function(x, probs=c(.25, .50, .75))
    {
        theCuts <- getBucketCuts(x, probs=probs)
        cut(x, breaks=theCuts, labels=seq_len(length(probs) + 2 - 1), include.lowest=TRUE)
    }

    dataList[[NewName]] <- makeBuckets(dataList[[col]])

    return(dataList)
}

tester <- dia %>% group_by_(.dots=c('cut', 'color')) %>% mutate(.rxArgs=list(transformFunc=makeBucketDF, transformVars='price', transformObjects=list(col='price', NewName='Bucket')))
head(tester)
