Using sparklyr with an Apache Spark cluster in RStudio

library(sparklyr)
library(dplyr)
library(ggplot2)

# Open a sparklyr connection to the cluster's Spark installation via YARN,
# starting from the default sparklyr configuration.
# NOTE: string literals must use plain ASCII double quotes; typographic
# ("curly") quotes pasted from a web page are a syntax error in R.
conf <- spark_config()
sc <- spark_connect(
  master = "yarn-client",
  spark_home = "/usr/hdp/current/spark-client/",
  version = "1.6.2",
  config = conf
)

# Copy local example data frames into Spark. The optional third argument is
# the name given to the table on the Spark side.
iris_tbl <- copy_to(sc, iris)
flights_tbl <- copy_to(sc, nycflights13::flights, "flights")
batting_tbl <- copy_to(sc, Lahman::Batting, "batting")

# Keep only the flights whose dep_delay value equals 2.
flights_tbl %>% filter(dep_delay == 2)

https://spark.rstudio.com/examples/cloudera-aws/

 

 

Advertisements