{"version":1,"pages":[{"id":"-M1PNWoglPNLjlN-W3Cb","title":"Preface","pathname":"/data-science-and-apache-spark","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M5JKev2jUiYp_RgegMN","title":"Contents","pathname":"/data-science-and-apache-spark/table-of-contents","siteSpaceId":"sitesp_gVJEX","description":"all you need is to just click the links..."},{"id":"-M7aVt0T_tHIYrx1gemr","title":"Basic Prerequisite Skills","pathname":"/data-science-and-apache-spark/basic-prerequisite-skills","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1SUa34C-kcxc7d4li8","title":"Computer needed for this course","pathname":"/data-science-and-apache-spark/computer_needed","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1SUpayfoqvimQEM3QE","title":"Spark Environment Setup","pathname":"/data-science-and-apache-spark/spark_setup","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1SVB08FGhhp4UKoiNK","title":"Dev environment setup, task list","pathname":"/data-science-and-apache-spark/dev_setup4","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1SVc16l-iosMF7ha0t","title":"JDK setup","pathname":"/data-science-and-apache-spark/jdk_setup5","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1SVrotNFa98xPgVLeO","title":"Download and install Anaconda Python and create virtual environment with Python 3.6","pathname":"/data-science-and-apache-spark/conda_setup6","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1SWAu0lZ3dsrjc8p_A","title":"Download and install Spark","pathname":"/data-science-and-apache-spark/download-and-install-spark","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1StqAgNR67MeekYfKb","title":"Eclipse, the Scala IDE","pathname":"/data-science-and-apache-spark/scala-ide","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1SuTbNgLcj8zMX05VB","title":"Install findspark, add spylon-kernel for scala","pathname":"/data-science-and-apache-spark/install-findspark-add-spylon-kernel-for-scala","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M7R1cp83KSqgs6Qrssh","title":"ssh and scp client","pathname":"/data-science-and-apache-spark/ssh-and-scp-client","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1SugFTOJbT3-u8ZQcM","title":"Summary","pathname":"/data-science-and-apache-spark/summary2","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M67qcrQNkua3MFdEd2M","title":"Development environment on MacOS","pathname":"/data-science-and-apache-spark/development-environment-on-macos","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1SusC-azkwjqa_brHG","title":"Production Spark Environment Setup","pathname":"/data-science-and-apache-spark/production-spark-environment-setup","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M6N8HZY6_LUpXpQcQUN","title":"VirtualBox VM","pathname":"/data-science-and-apache-spark/virtualbox-vm","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M6ND87rNib7MeWeHYFp","title":"VirtualBox only shows 32bit on AMD CPU","pathname":"/data-science-and-apache-spark/virtualbox-only-shows-32bit-on-amd-cpu","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M6NGOil5JwxM3p0KU5Q","title":"Configure VirtualBox NAT as Network Adapter on Guest VM and Allow putty ssh Through Port Forwarding","pathname":"/data-science-and-apache-spark/configure-virtualbox-nat-as-network-adapter-on-guest-vm-and-allow-putty-ssh-through-port-forwarding","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Sv40-AVA9BdRh4jOy","title":"Docker deployment of Spark Cluster","pathname":"/data-science-and-apache-spark/docker-deployment-of-spark-cluster","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1SvaYUJQ-5Eru9VZ37","title":"Create customized Apache Spark Docker container","pathname":"/data-science-and-apache-spark/create-customized-apache-spark-docker-container","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1SvruWeBNeC-Sxd7nd","title":"Dockerfile","pathname":"/data-science-and-apache-spark/untitled-12","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1SxeMyNPw9HMfDF9vO","title":"docker-compose and docker-compose.yml","pathname":"/data-science-and-apache-spark/docker-compose-and-docker-compose.yml","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1TiUYQC3Y90tBdeTqx","title":"Launch custom built Docker container with docker-compose","pathname":"/data-science-and-apache-spark/launch-custom-built-docker-container-with-docker-compose","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M5ZcCc8mXW4G7VhEsm_","title":"Entering Docker Container","pathname":"/data-science-and-apache-spark/entering-docker-container","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Tj13OPC2FsZm3J3m_","title":"Setup Hadoop, Hive and Spark on Linux without docker","pathname":"/data-science-and-apache-spark/setup-hadoop-hive-and-spark-on-linux-without-docker","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Tk8AsOUFYj-HpwKg4","title":"Hadoop Preparation","pathname":"/data-science-and-apache-spark/hadoop-configuration","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1TjwT3cr2EDvq4gxLm","title":"Hadoop setup","pathname":"/data-science-and-apache-spark/hadoop-setup","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1TkOqTk705HlAFtQ6h","title":"Configure $HADOOP_HOME/etc/hadoop","pathname":"/data-science-and-apache-spark/configure-usdhadoop_home-etc-hadoop","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Tkpp0USyR_Ghgj5Sg","title":"HDFS","pathname":"/data-science-and-apache-spark/hdfs","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1UIxoAjHAFlPuoBzx2","title":"Start and stop Hadoop","pathname":"/data-science-and-apache-spark/start-hadoop","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1UJQCCJkVSuXilh-dJ","title":"Work with Hadoop and HDFS file system","pathname":"/data-science-and-apache-spark/work-with-hadoop-and-hdfs-file-system","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1UJgmZ_4ZEixwu3yb0","title":"Connect to Hadoop web interface port 50070 and 8088","pathname":"/data-science-and-apache-spark/connect-to-hadoop-web-interface-port-50070","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1UJzW6qJLZeaM6ntrd","title":"Install Hive","pathname":"/data-science-and-apache-spark/install-hive","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1UKBW3XO0SJ07K7_JD","title":"hive home","pathname":"/data-science-and-apache-spark/hive-home","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1UKOZFy04stIaFJZVW","title":"Initialize hive schema","pathname":"/data-science-and-apache-spark/initialize-hive-schema","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1UMBISUeHVz3JCCad-","title":"Start hive metastore service.","pathname":"/data-science-and-apache-spark/start-hive-metastore-service.","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M68QNV8TNb26PrbppD_","title":"hive-site.xml","pathname":"/data-science-and-apache-spark/hive-site.xml","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1UMMXIO4e1H6pg9YDu","title":"Hive client","pathname":"/data-science-and-apache-spark/hive-client","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1UMX3q8bez7wVDSglG","title":"Setup Apache Spark","pathname":"/data-science-and-apache-spark/setup-apache-spark","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1UMjQcsAYinvUsmRaG","title":"Spark Home","pathname":"/data-science-and-apache-spark/spark-home","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M6wnABaL4b_kTJwcQhZ","title":"Jupyter-notebook server","pathname":"/data-science-and-apache-spark/jupyter-notebook-server","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1UT1brH4wPPqCF_K9e","title":"Python 3 Warm Up","pathname":"/data-science-and-apache-spark/python-3-warm-up","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1UTG_GkQy9sCDvHVeF","title":"Basics","pathname":"/data-science-and-apache-spark/python-basics","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1W2CIdRfKkmnqalm6S","title":"Iterables/Collections","pathname":"/data-science-and-apache-spark/iterables-collections","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1W2S66jHoNL_KTRE2e","title":"Strings","pathname":"/data-science-and-apache-spark/python-strings","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1W2cVJrS-PH2EzYcBf","title":"List","pathname":"/data-science-and-apache-spark/python-list","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XjelKXNxo46F1sAqj","title":"Tuple","pathname":"/data-science-and-apache-spark/python-tuple","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Xjq58ZyBurSW1w_DQ","title":"Dictionary","pathname":"/data-science-and-apache-spark/python-dictionary","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Xk3dMbjb-fsvYOcLM","title":"Set","pathname":"/data-science-and-apache-spark/python-set","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XkIqL_cwLnA94hJxA","title":"Conditional statement","pathname":"/data-science-and-apache-spark/conditional-statement","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XkXl3PUaGAoU58VuL","title":"for loop","pathname":"/data-science-and-apache-spark/loop-statement-for-statement","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M627P7UX-fuVbA89iWM","title":"while loop","pathname":"/data-science-and-apache-spark/while-loop","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XlzMg17u1AAszmgm6","title":"Functions and methods","pathname":"/data-science-and-apache-spark/functions-and-methods","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XmI5htNxpLj3F3NXC","title":"map and filter","pathname":"/data-science-and-apache-spark/map-and-filter","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Xp9ZUx8MSGLWQMUSy","title":"map and filter takes function as input","pathname":"/data-science-and-apache-spark/map-and-filter-takes-function-as-input","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XpOGnxfLRBwu16zxH","title":"lambda","pathname":"/data-science-and-apache-spark/lambda","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XpYsAx6O2epla21aj","title":"Python Class","pathname":"/data-science-and-apache-spark/data-structure","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XpqlfG3oYC6gn8SWY","title":"Input and if statement","pathname":"/data-science-and-apache-spark/input-and-if-statement","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XrvzegmvnLkur-aFG","title":"Input from a file","pathname":"/data-science-and-apache-spark/input-from-a-file","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Xs6ZOrp5rPqs99C-R","title":"Output to a file","pathname":"/data-science-and-apache-spark/output-to-a-file","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M5TxX3ahitDokvgPoqy","title":"try except","pathname":"/data-science-and-apache-spark/try-except","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4lnwhTEeIx1iZnn5NR","title":"Python coding exercise","pathname":"/data-science-and-apache-spark/python-coding-excercise","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XsJ1Qix6n6pOnDYGp","title":"Scala Warm Up","pathname":"/data-science-and-apache-spark/scala-warm-up","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M6wDh69zauJILDRhLA4","title":"Start Spylon-kernel on Jupyter-notebook","pathname":"/data-science-and-apache-spark/start-spylon-kernel-on-jupyter-notebook","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XsTEAIIuhv5Zv36pg","title":"Type of Variable: Mutable or immutable","pathname":"/data-science-and-apache-spark/type-of-variable-mutable-or-immutable","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M6cmJYVBbm6r6mm-VUs","title":"Block statement","pathname":"/data-science-and-apache-spark/block-statement","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4mAZG9CNQ6XoRWnk2a","title":"Scala Data Type","pathname":"/data-science-and-apache-spark/scala-data-type","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4m9uUlTk7jCKwMhFA5","title":"Array in Scala","pathname":"/data-science-and-apache-spark/array-in-scala","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XsdnTnOCLSuqNLPyZ","title":"Methods","pathname":"/data-science-and-apache-spark/scala-methods","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M6cn0jtaULjgWJ9JLfH","title":"Functions","pathname":"/data-science-and-apache-spark/functions","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M5UNrSXtQYQY3_WA1W9","title":"Anonymous function","pathname":"/data-science-and-apache-spark/anonymous-function","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M5UNdW7wRa4p8Fml7PP","title":"Scala map and filter methods","pathname":"/data-science-and-apache-spark/scala-map-and-filter-methods","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XsmZPqWa3HzT6OaCd","title":"Class","pathname":"/data-science-and-apache-spark/scala-class","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XvbnXVHIaHLnnzBs0","title":"Objects","pathname":"/data-science-and-apache-spark/scala-objects","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Xvkct2BjLTMZJg-ai","title":"Trait","pathname":"/data-science-and-apache-spark/scala-trait","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M5SrmhbkSAFYhIBw_HL","title":"Tuple in Scala","pathname":"/data-science-and-apache-spark/tuple-in-scala","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M5U0UI7npTSNmMqL_tM","title":"List/Seq","pathname":"/data-science-and-apache-spark/list-seq","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M5U9WkfRud3oL08fGBV","title":"Set in Scala","pathname":"/data-science-and-apache-spark/set-in-scala","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M5UEro3hgYjClsZo4KB","title":"Scala Map","pathname":"/data-science-and-apache-spark/scala-map","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4mB47NHCoTyo5sKWa_","title":"Scala if statement","pathname":"/data-science-and-apache-spark/scala-if-statement","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4mC145g093nsVM4DEd","title":"Scala for loop","pathname":"/data-science-and-apache-spark/scala-for-loop","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4mCPnNisluX9cAsiTq","title":"Scala While Loop","pathname":"/data-science-and-apache-spark/scala-while-loop","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4mCsOHMhy0Y77UOUmv","title":"Scala Exceptions + try catch finally","pathname":"/data-science-and-apache-spark/scala-exceptions-+-try-catch-finally","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4locqplIpHELmypy4t","title":"Scala coding exercise","pathname":"/data-science-and-apache-spark/scala-coding-excercise","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XxUrOtcAa78rMGOh-","title":"Run a program to estimate pi","pathname":"/data-science-and-apache-spark/run-a-program-to-estimate-pi","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M6723nR2QswAajqdr-N","title":"Common Spark command line","pathname":"/data-science-and-apache-spark/spark-command-line","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XxksRm8umRDCrZ8dA","title":"Run Scala code with spark-submit","pathname":"/data-science-and-apache-spark/run-scala-code-with-apache-spark","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Xy3P9nl3VN43XNEkM","title":"Python with Apache Spark using Jupyter notebook","pathname":"/data-science-and-apache-spark/python-with-apache-spark-using-jupyter-notebook","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4eZlyPPYw9g_p_Hn2A","title":"Spark Core Introduction","pathname":"/data-science-and-apache-spark/spark-core-introduction","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4fJ8goqK129Xn0Ci9U","title":"Spark and Scala Version","pathname":"/data-science-and-apache-spark/spark-and-scala-version","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4fJryur7HKtzSKIiim","title":"Basic Spark Package","pathname":"/data-science-and-apache-spark/basic-spark-package","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4fLTNmATzoBL4sf6lb","title":"Resilient Distributed Datasets (RDDs)","pathname":"/data-science-and-apache-spark/resilient-distributed-datasets-rdds","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4feYCE4fcNKcrCCLWh","title":"RDD Operations","pathname":"/data-science-and-apache-spark/rdd-operations","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4fpVy2wmexM2JwskwM","title":"Passing Function to Spark","pathname":"/data-science-and-apache-spark/passing-function-to-spark","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4fsGlxgfwUSwjUGDCd","title":"Printing elements of an RDD","pathname":"/data-science-and-apache-spark/printing-elements-of-an-rdd","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4fw15GAgKqcjH2jXyw","title":"Working with key value pair","pathname":"/data-science-and-apache-spark/working-with-key-value-pair","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4fyk8QCEjTONK8yT4b","title":"RDD Transformation Functions","pathname":"/data-science-and-apache-spark/rdd-transformation-funcitons","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4g-A8qygro1s3MyWRd","title":"RDD Action Functions","pathname":"/data-science-and-apache-spark/rdd-action-functions","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XyjoyWI-19yn2X_p5","title":"SPARK SQL","pathname":"/data-science-and-apache-spark/untitled-57","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4g0xMPHiyjky84Ovxp","title":"SQL","pathname":"/data-science-and-apache-spark/sql","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4g1VGrTcOygU_tLhj3","title":"Datasets and DataFrames","pathname":"/data-science-and-apache-spark/datasets-and-dataframes","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4ga7RlSE96Yt8vuds7","title":"SparkSession","pathname":"/data-science-and-apache-spark/sparksession","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4gb8N9nrX_UsmHAXOv","title":"Creating DataFrames","pathname":"/data-science-and-apache-spark/creating-dataframes","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4gdDNMDRufbK9zJTWf","title":"Running SQL Queries Programmatically","pathname":"/data-science-and-apache-spark/running-sql-queries-programmatically","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M8d4sh7xO1jjCLlHCno","title":"Issue from running Cartesian Join Query","pathname":"/data-science-and-apache-spark/issue-from-running-cartesian-join-query","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4gm1L8JGqry9K-q1Ej","title":"Creating Datasets","pathname":"/data-science-and-apache-spark/creating-datasets","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4gqAkhiOt6Te967mBc","title":"Interoperating with RDD","pathname":"/data-science-and-apache-spark/interoperating-with-rdd","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4gyFgSpAeaw1Zju-o8","title":"Untyped User-Defined Aggregate Functions","pathname":"/data-science-and-apache-spark/untyped-user-defined-aggregate-functions","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4h5r_avILnIFcOTTHW","title":"Generic Load/Save Functions","pathname":"/data-science-and-apache-spark/generic-load-save-functions","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4h7xUP1eFdlCZ0CQta","title":"Manually specify file option","pathname":"/data-science-and-apache-spark/manually-specify-file-option","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4h9G2STknS0RIUElHw","title":"Run SQL on files directly","pathname":"/data-science-and-apache-spark/run-sql-on-files-directly","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4hC6gtpzIReDeDSqqZ","title":"Save Mode","pathname":"/data-science-and-apache-spark/save-mode","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4hD97pd7C4bN2HBXdK","title":"Saving to Persistent Tables","pathname":"/data-science-and-apache-spark/saving-to-persistent-tables","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4hDQkcss0FBnpi4J61","title":"Bucketing, Sorting and Partitioning","pathname":"/data-science-and-apache-spark/bucketing-sorting-and-partitioning","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4jXinhyf7W37uPtJ1l","title":"Apache Arrow","pathname":"/data-science-and-apache-spark/apache-arrow","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4jY6D1RuDsCez_9_eg","title":"Install Python Arrow Module PyArrow","pathname":"/data-science-and-apache-spark/install-python-arrow-module-pyarrow","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4jky5P8gyTLA4QovYz","title":"Issue might happen import PyArrow","pathname":"/data-science-and-apache-spark/issue-might-happen-import-pyarrow","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4jYs5z2Bl9UNShsiaV","title":"Enabling for Conversion to/from Pandas in Python","pathname":"/data-science-and-apache-spark/enabling-for-conversion-to-from-pandas","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XyxYTC2TonjYJIg-J","title":"Connect to any data source the same consistent way","pathname":"/data-science-and-apache-spark/connect-to-any-data-source-the-same-consistent-way","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1XzFAz6dOuh3bdahC2","title":"Spark SQL Implementation Example in Scala","pathname":"/data-science-and-apache-spark/spark-sql-implementation-example-in-scala","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y0av8WD9kDpO8R418","title":"Run scala code in Eclipse IDE","pathname":"/data-science-and-apache-spark/run-scala-code-in-eclipse-ide","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y0ns_sqB-of6aWN_b","title":"Hive Integration, run SQL or HiveQL queries on existing warehouses.","pathname":"/data-science-and-apache-spark/hive-integration-run-sql-or-hiveql-queries-on-existing-warehouses.","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1vG6MZm65ulOcvCcn4","title":"Example: Enrich JSON","pathname":"/data-science-and-apache-spark/enrich-json","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M5zOQVnwFBkAZQd6cWq","title":"Integrate Tableau Data Visualization with Hive Data Warehouse and Apache Spark SQL","pathname":"/data-science-and-apache-spark/integrate-tableau-data-visualization-with-hive-data-warehouse-and-apache-spark-sql","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M915SYgt0jeKHTD_qIL","title":"Connect Tableau to Spark SQL running in VM with VirtualBox with NAT","pathname":"/data-science-and-apache-spark/connect-tableau-to-spark-sql-running-in-vm-with-virtualbox-with-nat","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M9FUEoftIVzxd6QDeAH","title":"Issues with connecting from Tableau to Spark SQL","pathname":"/data-science-and-apache-spark/issues-with-connecting-from-tableau-to-spark-sql","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y2fY1RuDQ9T4-Yd30","title":"SPARK Streaming","pathname":"/data-science-and-apache-spark/spark-streaming","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y2yUf0iDajYOVtm8x","title":"Discretized Streams (DStreams)","pathname":"/data-science-and-apache-spark/discretized-streams-dstreams","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y3BBzzFWXBL1rmtoO","title":"Transformations on DStreams","pathname":"/data-science-and-apache-spark/transformations-on-dstreams","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y3NlVg7yRk9xu2FKI","title":"map(func)","pathname":"/data-science-and-apache-spark/map-func","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y3XdxsgJihWEU7eRX","title":"filter(func)","pathname":"/data-science-and-apache-spark/filter-func","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y3fnvDAKTLQCJv-u8","title":"repartition(numPartitions)","pathname":"/data-science-and-apache-spark/repartition-numpartitions","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y3sDJvpfC_sVANBzw","title":"union(otherStream)","pathname":"/data-science-and-apache-spark/union-otherstream","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y6ap0VcBWPg6dWsdx","title":"reduce(func)","pathname":"/data-science-and-apache-spark/reduce-func","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y4F7i3YOx0ZfJL9-2","title":"count()","pathname":"/data-science-and-apache-spark/stream-count","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y6mJ807EJHHubOl-R","title":"countByValue()","pathname":"/data-science-and-apache-spark/countbyvalue","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y70UCfysQL-lFF09q","title":"reduceByKey(func, [numTasks])","pathname":"/data-science-and-apache-spark/reducebykey-func-numtasks","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y7BaztmO6lvFBcYzX","title":"join(otherStream, [numTasks])","pathname":"/data-science-and-apache-spark/join-otherstream-numtasks","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y7Q05UAMk1jyV2zsC","title":"cogroup(otherStream, [numTasks])","pathname":"/data-science-and-apache-spark/cogroup-otherstream-numtasks","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y7_FJ2Mg8H65H5Lo5","title":"transform(func)","pathname":"/data-science-and-apache-spark/transform-func","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y7qfNLldTXRbx-cFU","title":"updateStateByKey(func)","pathname":"/data-science-and-apache-spark/updatestatebykey-func","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M8D7q_7MIZvrcwMQYj0","title":"Scala Tips for updateStateByKey","pathname":"/data-science-and-apache-spark/scala-tips-for-updatestatebykey","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y7zqd-fz7tFmkY9Z4","title":"repartition(numPartitions)","pathname":"/data-science-and-apache-spark/repartition-numpartitions-1","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y8ZSzJZJa-jiHUHAZ","title":"DStream Window Operations","pathname":"/data-science-and-apache-spark/dstream-window-operations","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y90x9C46S9ClYjTo5","title":"DStream Window Transformation","pathname":"/data-science-and-apache-spark/dstream-window-transformation","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y9NZmGmypDJRIJLag","title":"countByWindow(windowLength, slideInterval)","pathname":"/data-science-and-apache-spark/countbywindow-windowlength-slideinterval","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1Y9iRi_0QS2uduotKP","title":"reduceByWindow(func, windowLength, slideInterval)","pathname":"/data-science-and-apache-spark/reducebywindow-func-windowlength-slideinterval","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YBkP_YmQHYHXCb6Bz","title":"reduceByKeyAndWindow(func, windowLength, slideInterval, [numTasks])","pathname":"/data-science-and-apache-spark/reducebykeyandwindow-func-windowlength-slideinterval-numtasks","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YBxowUgC55O-PPl5-","title":"reduceByKeyAndWindow(func, invFunc, windowLength, slideInterval, [numTasks])","pathname":"/data-science-and-apache-spark/reducebykeyandwindow-func-invfunc-windowlength-slideinterval-numtasks","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YCDlH9MuQeq5ISLbJ","title":"countByValueAndWindow(windowLength, slideInterval, [numTasks])","pathname":"/data-science-and-apache-spark/countbyvalueandwindow-windowlength-slideinterval-numtasks","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YCR1vbygeHD8gxA_P","title":"window(windowLength, slideInterval)","pathname":"/data-science-and-apache-spark/window-windowlength-slideinterval","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YElBOzgMiX1Arxc8s","title":"Window DStream print(n)","pathname":"/data-science-and-apache-spark/window-dstream-print-n","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YF-0BkGtm5VuVFCUg","title":"saveAsTextFiles(prefix, [suffix])","pathname":"/data-science-and-apache-spark/untitled-92","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YF98oIabYgeqMpEK4","title":"saveAsObjectFiles(prefix, [suffix])","pathname":"/data-science-and-apache-spark/saveasobjectfiles-prefix-suffix","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YFJ7PnDhppr289-CW","title":"saveAsHadoopFiles(prefix, [suffix])","pathname":"/data-science-and-apache-spark/saveashadoopfiles-prefix-suffix","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YFUZ3Bg-FTrMzBBrS","title":"foreachRDD(func)","pathname":"/data-science-and-apache-spark/foreachrdd-func","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M9eDaYFfwFSl04VI0av","title":"Build Twitter Scala API Library for Spark Streaming using sbt","pathname":"/data-science-and-apache-spark/build-twitter-api-library-for-spark-streaming-using-sbt","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YI91DbFWeaoH59H_B","title":"Spark Streaming with Twitter, you can get public tweets by using Twitter API.","pathname":"/data-science-and-apache-spark/spark-streaming-with-twitter-you-can-get-public-tweets-by-using-twitter-api.","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YIWKKEmg3eX_mFyYm","title":"Spark streaming use case with Python","pathname":"/data-science-and-apache-spark/spark-streaming-use-case-with-python","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3j0bPLNEGeXh46qMp1","title":"Spark Graph Computing","pathname":"/data-science-and-apache-spark/spark-graph-computing","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M47iljK9dM1AyCoJJvQ","title":"Spark Graph Computing Continue","pathname":"/data-science-and-apache-spark/spark-graph-computing-continue","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3JS8ccqqzy7VjDOk12","title":"Graphx","pathname":"/data-science-and-apache-spark/graphx-1","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3A8lVNQUxT4R5fKJEU","title":"Package org.apache.spark.graphx","pathname":"/data-science-and-apache-spark/graphx","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3DDrdFkxn93y9wQsoX","title":"Edge Class","pathname":"/data-science-and-apache-spark/edge-class","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3DFi9eTczzsopAEyv1","title":"EdgeContext Class","pathname":"/data-science-and-apache-spark/edgecontext-class","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3DPSlxLRHKqFvx4Gne","title":"EdgeDirection Class","pathname":"/data-science-and-apache-spark/edgedirection-class","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3DR6T9xWR6bBoF3KFr","title":"EdgeRDD Class","pathname":"/data-science-and-apache-spark/edgerdd-class","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3DUeTgrKrqTJklHt5H","title":"EdgeTriplet Class","pathname":"/data-science-and-apache-spark/edgetriplet-class","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3DYR7mYeOQwiog_hfz","title":"Graph Class","pathname":"/data-science-and-apache-spark/graph-class","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3EoVJUqPRROnIN0l3s","title":"GraphLoader Object","pathname":"/data-science-and-apache-spark/graphloader-object","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3EpmD6PKqaIEE7VU0E","title":"GraphOps Class","pathname":"/data-science-and-apache-spark/graphops-class","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3F548-U0UIwSm4P4UG","title":"GraphXUtils Object","pathname":"/data-science-and-apache-spark/graphxutils-object","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3F5u9NzjbC-c3svauS","title":"PartitionStrategy Trait","pathname":"/data-science-and-apache-spark/partitionstrategy-trait","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3F6F6GMX7hIW-Ealo1","title":"Pregel Object","pathname":"/data-science-and-apache-spark/pregel-object","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3F7NcUIyNwnFyvz5VZ","title":"TripletFields  Class","pathname":"/data-science-and-apache-spark/tripletfields-class","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3F7n7WZ6iQqQ-mW4v_","title":"VertexRDD Class","pathname":"/data-science-and-apache-spark/vertexrdd-class","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3HxF07naTRKH1Cv97X","title":"Package org.apache.spark.graphx.impl","pathname":"/data-science-and-apache-spark/untitled","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3Hyb8iFMGOpareZ_cF","title":"AggregatingEdgeContext Class","pathname":"/data-science-and-apache-spark/aggregatingedgecontext-class","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3I-W5Dth9ldxv4mo0a","title":"EdgeRDDImpl Class","pathname":"/data-science-and-apache-spark/edgerddimpl-class","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3IVS0eYFqe6nlqmtzb","title":"Class GraphImpl<VD,ED>","pathname":"/data-science-and-apache-spark/class-graphimpl-less-than-vd-ed-greater-than","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3In-Q4yoLFLPvJun8_","title":"Class VertexRDDImpl<VD>","pathname":"/data-science-and-apache-spark/class-vertexrddimpl-less-than-vd-greater-than","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3J2Pm3sYTkM6sQJYES","title":"Package org.apache.spark.graphx.lib","pathname":"/data-science-and-apache-spark/package-org.apache.spark.graphx.lib-1","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3J3C5kr_GkB4qh8xsK","title":"Class ConnectedComponents","pathname":"/data-science-and-apache-spark/class-connectedcomponents","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3J3mh11UPgHtD2SJYR","title":"Class LabelPropagation","pathname":"/data-science-and-apache-spark/class-labelpropagation","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3JKSewAzBMpzk6gvx0","title":"Class PageRank","pathname":"/data-science-and-apache-spark/class-pagerank","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3JMnbD34zbuCqIqslQ","title":"Class ShortestPaths","pathname":"/data-science-and-apache-spark/class-shortestpaths","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3JNl6bSXY8PxqZeWrp","title":"Class StronglyConnectedComponents","pathname":"/data-science-and-apache-spark/class-stronglyconnectedcomponents","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3JPTv8mvLkw72sj7Hp","title":"Class SVDPlusPlus","pathname":"/data-science-and-apache-spark/class-svdplusplus","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3JQJY1P4GQoRXya6Kx","title":"Class SVDPlusPlus.Conf","pathname":"/data-science-and-apache-spark/class-svdplusplus.conf","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3JQzTlBZTBOry8kTEo","title":"Class TriangleCount","pathname":"/data-science-and-apache-spark/class-trianglecount","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3IpA7dftJgZ6eOSEfZ","title":"Package org.apache.spark.graphx.util","pathname":"/data-science-and-apache-spark/package-org.apache.spark.graphx.lib","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3IvdAtX9BVlixBHa-k","title":"Class BytecodeUtils","pathname":"/data-science-and-apache-spark/class-bytecodeutils","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3IwQdU2Y7rxkQASIHG","title":"Class GraphGenerators","pathname":"/data-science-and-apache-spark/class-graphgenerators","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3FHf6q5SeER4b3Goas","title":"Graphx Example 1","pathname":"/data-science-and-apache-spark/graphx-examples","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3FSDpLqzGCczJDPzkt","title":"Graphx Example 2","pathname":"/data-science-and-apache-spark/graphx-example-2","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3XNTplDbvWNlaCxm0N","title":"Graphx Example 3","pathname":"/data-science-and-apache-spark/graphx-example-3","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YImAvDbrBNokwiFnk","title":"Spark Graphx Describes Organization Chart Easy and Fast","pathname":"/data-science-and-apache-spark/untitled-98","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4boG5bidiI32f7Sl-I","title":"Page Rank with Apache Spark Graphx","pathname":"/data-science-and-apache-spark/graphx-application-case-2","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4bzJtRO4NzTaDHJgSh","title":"bulk synchronous parallel with Google Pregel Graphx Implementation Use Cases","pathname":"/data-science-and-apache-spark/graphx-application-case-3","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4viHjzfdoU4QOm4Myv","title":"Tree and Graph Traversal with and without Spark Graphx","pathname":"/data-science-and-apache-spark/tree-and-graph-traversal","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-MBBMzHwPFrPUk5vQBdE","title":"Graphx Graph Traversal with Pregel Explained","pathname":"/data-science-and-apache-spark/graphx-graph-traversal-with-pregel-explained","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YMWNwogzmfM5XjO8q","title":"Spark Machine Learning","pathname":"/data-science-and-apache-spark/spark-machine-learning","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YMeIpgBVswz7jDlHv","title":"Binary Classification","pathname":"/data-science-and-apache-spark/binary-classification","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YMmmwgvjwsPJBycZo","title":"Multiclass Classification","pathname":"/data-science-and-apache-spark/multiclass-classification","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YN-vRImAwLIYqndZT","title":"Regression","pathname":"/data-science-and-apache-spark/regression","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YNBpkaeiG5tl5zz-N","title":"Correlation","pathname":"/data-science-and-apache-spark/correlation","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1xyATk_aMZ07lpDU1K","title":"Image Data Source","pathname":"/data-science-and-apache-spark/image-data-source","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YOxxP4Ffh68ztoxpY","title":"ML DataFrame is SQL DataFrame","pathname":"/data-science-and-apache-spark/ml-dataframe","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YPDIzQUj9wwtQnD8S","title":"ML Transformer","pathname":"/data-science-and-apache-spark/ml-transformer","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YPT--v4fn8rOrGdQW","title":"ML Estimator","pathname":"/data-science-and-apache-spark/ml-estimator","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YPkG7W9Ab5GrbcZ0O","title":"ML Pipeline","pathname":"/data-science-and-apache-spark/ml-pipeline","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YPsX1zNsyE2H-Rdvc","title":"Transformer/Estimator Parameters","pathname":"/data-science-and-apache-spark/transformer-estimator-parameters","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YQ3ESQaqTU_oiBdoC","title":"Extracting, transforming and selecting features","pathname":"/data-science-and-apache-spark/extracting-transforming-and-selecting-features","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YQtFSFHfWPL11BWt5","title":"TF-IDF","pathname":"/data-science-and-apache-spark/tf-idf","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2lQYEdDUVbLk285GUc","title":"Word2Vec","pathname":"/data-science-and-apache-spark/word2vec","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2lTjN2IZxAK6WiwI-B","title":"FeatureHasher","pathname":"/data-science-and-apache-spark/featurehasher","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2lVocdaq5f0C4bGQD_","title":"Tokenizer","pathname":"/data-science-and-apache-spark/tokenizer","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M53_rxReQvUCaE-4Z5S","title":"CountVectorizer","pathname":"/data-science-and-apache-spark/countvectorizer","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2lYbulKtkOcsyOkVNu","title":"StopWordRemover","pathname":"/data-science-and-apache-spark/stopwordremover","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2pJOnnp9AzxDjszaLo","title":"n-gram","pathname":"/data-science-and-apache-spark/n-gram","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2pM9rfYZY7o-Nu-riU","title":"Binarizer","pathname":"/data-science-and-apache-spark/binarizer","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YRUsJfusDnZ2vuceK","title":"PCA","pathname":"/data-science-and-apache-spark/pca","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2pRNq7rmLISSvtxyXB","title":"PolynomialExpansion","pathname":"/data-science-and-apache-spark/polynomialexpansion","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2pTeiRfRUbvu9_--lv","title":"StringIndexer","pathname":"/data-science-and-apache-spark/stringindexer","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2pY8jUxWR0XlaRwlX2","title":"Discrete Cosine Transform (DCT)","pathname":"/data-science-and-apache-spark/discrete-cosine-transform-dct","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YRh6ej36jQJ1PGjEN","title":"One-hot encoding","pathname":"/data-science-and-apache-spark/one-hot-encoding","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YRxF6xCV2sy7O5Bg9","title":"StandardScaler","pathname":"/data-science-and-apache-spark/standardscaler","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2pVqf3D5O5iDWbn4Pu","title":"IndexToString","pathname":"/data-science-and-apache-spark/indextostring","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2psmwQIuCRNZvrAMvH","title":"VectorIndexer","pathname":"/data-science-and-apache-spark/vectorindexer","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2qMN_5GHKFY_1jR-_G","title":"Interaction","pathname":"/data-science-and-apache-spark/interaction","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2qNyl1dfYEoeydpA9n","title":"Normalizer","pathname":"/data-science-and-apache-spark/normalizer","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YS8qFvbfeehKT4DFc","title":"MinMaxScaler","pathname":"/data-science-and-apache-spark/minmaxscaler","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2qYlsYXjn01ideFbR_","title":"MaxAbScaler","pathname":"/data-science-and-apache-spark/maxabscaler","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YSOpPCpyaZxXtMvGK","title":"Bucketizer","pathname":"/data-science-and-apache-spark/bucketizer","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2qcujqoox0TJb344Yi","title":"ElementwiseProduct","pathname":"/data-science-and-apache-spark/elementwiseproduct","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YSdncoqsHS1MxLR7x","title":"SQLTransformer","pathname":"/data-science-and-apache-spark/sqltransformer","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YSoqGxROId_QHKBkZ","title":"VectorAssembler","pathname":"/data-science-and-apache-spark/vectorassembler","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2qw__SkJHmJJk-fyIk","title":"VectorSizeHint","pathname":"/data-science-and-apache-spark/vectorsizehint","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YVTjnPyzd1WrqhcHW","title":"QuantileDiscretizer","pathname":"/data-science-and-apache-spark/quantilediscretizer","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2sov2L3NPox6Y7c2z4","title":"Imputer","pathname":"/data-science-and-apache-spark/imputer","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YVe1H0bXM8WZMIvT2","title":"VectorSlicer","pathname":"/data-science-and-apache-spark/vectorslicer","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2ssRIwbHVPLcFS1T4f","title":"RFormula","pathname":"/data-science-and-apache-spark/rformula","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YVohvq3UnSZCpU1FX","title":"ChiSqSelector","pathname":"/data-science-and-apache-spark/chisqselector","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2uPsMz5aKFHlK8ewtH","title":"Locality Sensitive Hashing","pathname":"/data-science-and-apache-spark/locality-sensitive-hashing","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2uT_ABZ23l8_EKEmVA","title":"MinHash for Jaccard Distance","pathname":"/data-science-and-apache-spark/minhash-for-jaccard-distance","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2uW-LAoKGB7HEMMuFO","title":"Classification and Regression","pathname":"/data-science-and-apache-spark/classification-and-regression","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M2u_8Fmn941A6LG1ieh","title":"LogisticRegression","pathname":"/data-science-and-apache-spark/logisticregression","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YWI5kzD8lxKshlcKc","title":"OneVsRest","pathname":"/data-science-and-apache-spark/onevsrest","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YWUsxW8ZhrR_hm6nE","title":"Naive Bayes classifiers","pathname":"/data-science-and-apache-spark/naive-bayes-classifiers","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YWg60nEbOI1kMayo1","title":"Decision trees","pathname":"/data-science-and-apache-spark/decision-trees","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YWmjCDoGJSXnpbneW","title":"Random forests","pathname":"/data-science-and-apache-spark/random-forests","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YWtsFKxmTuPMcHSQ9","title":"Gradient-boosted trees (GBTs)","pathname":"/data-science-and-apache-spark/gradient-boosted-trees-gbts","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M33PU2UQ3NMc85pIZHv","title":"Multilayer perceptron classifier","pathname":"/data-science-and-apache-spark/multilayer-perceptron-classifier","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M33Sv_OeDxL3EwP6Ysf","title":"Linear Support Vector Machine","pathname":"/data-science-and-apache-spark/linear-support-vector-machine","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M33vurQAOdyX8eEoH9b","title":"Linear Regression","pathname":"/data-science-and-apache-spark/linear-regression","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M33xmE23bkPthbUGKtN","title":"Generalized linear regression","pathname":"/data-science-and-apache-spark/generalized-linear-regression","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YYMgY3000TpGt7bTf","title":"Isotonic regression","pathname":"/data-science-and-apache-spark/isotonic-regression","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M33zaRbqqPh9AmbAdHN","title":"Decision Tree Regression","pathname":"/data-science-and-apache-spark/decision-tree-regression","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M34-yd2olPDU_5uq6ro","title":"Random Forest Regression","pathname":"/data-science-and-apache-spark/random-forest-regression","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M344XzUUEZKTs-IRQdy","title":"Gradient-boosted tree regression","pathname":"/data-science-and-apache-spark/gradient-boosted-tree-regression","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M349m0SMKMiS1Qumxp1","title":"Survival regression","pathname":"/data-science-and-apache-spark/survival-regression","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M34XBANlmJLP_PDnebi","title":"Clustering","pathname":"/data-science-and-apache-spark/clustering","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YY_SPTv-Ntn5oWEDN","title":"k-means","pathname":"/data-science-and-apache-spark/k-means","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YYj_nrGq-v1fLHUTg","title":"Latent Dirichlet allocation or LDA","pathname":"/data-science-and-apache-spark/latent-dirichlet-allocation-or-lda","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YYwIzw7PYBLtgWwsC","title":"Bisecting k-means","pathname":"/data-science-and-apache-spark/bisecting-k-means","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M1YZ4-fzgCYOJI6riPU","title":"A Gaussian Mixture Model","pathname":"/data-science-and-apache-spark/a-gaussian-mixture-model","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M34ebUBPYOq0wl-shif","title":"Collaborative filtering","pathname":"/data-science-and-apache-spark/collaborative-filtering","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M350suh8UCo78b5xhsK","title":"Frequent Pattern Mining","pathname":"/data-science-and-apache-spark/frequent-pattern-mining","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M3517Fjp0tkYPM6DWtE","title":"FP-Growth","pathname":"/data-science-and-apache-spark/fp-growth","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M352VLqlzBmRVjkYSK9","title":"PrefixSpan","pathname":"/data-science-and-apache-spark/prefixspan","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M353swLtPJIEQ7n4MV7","title":"ML Tuning: model selection and hyperparameter tuning","pathname":"/data-science-and-apache-spark/ml-tuning-model-selection-and-hyperparameter-tuning","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M354FEEDQenZ7Msnlp4","title":"Model selection (a.k.a. hyperparameter tuning)","pathname":"/data-science-and-apache-spark/model-selection-a.k.a.-hyperparameter-tuning","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M354jrt-5OPzMr3iWU9","title":"Cross-Validation","pathname":"/data-science-and-apache-spark/cross-validation","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M356eCbzxV-UgxCl3dl","title":"Train-Validation Split","pathname":"/data-science-and-apache-spark/train-validation-split","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M37S3UnayfauYo34F58","title":"Spark Machine Learning Applications","pathname":"/data-science-and-apache-spark/spark-machine-learning-applications","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M8RRz6-Cx8yzSQNL6XQ","title":"Apache Spark SQL & Machine Learning on Genetic Variant Classifications","pathname":"/data-science-and-apache-spark/apache-spark-sql-and-machine-learning-on-genetic-variant-classifications","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M37Tr7FPxXMdpjmvI9d","title":"Data Visualization with Vegas Viz and Scala with Spark ML","pathname":"/data-science-and-apache-spark/data-visualization-with-vegas-viz-and-scala-with-spark-ml","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M37sKwqiyb39vvIGTgm","title":"Apache Spark Machine Learning with Dremio Data Lake Engine","pathname":"/data-science-and-apache-spark/apache-spark-machine-learning-with-dremio-data-lake-engine","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M38e1RvkwHA8B-pVhtr","title":"Dremio Data Lake Engine Apache Arrow Flight Connector with Spark Machine Learning","pathname":"/data-science-and-apache-spark/dremio-data-lake-engine-apache-arrow-flight-connector-with-spark-machine-learning","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M5G93yt3wxZtT7DPlXK","title":"Neural Network with Apache Spark Machine Learning Multilayer Perceptron Classifier","pathname":"/data-science-and-apache-spark/neural-network-with-apache-spark-machine-learning-multilayer-perceptron-classifier","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-MBH2JIcvzFtQqan2E7v","title":"Setup TensorFlow, Keras, Theano, Pytorch/torchvision on the CentOS VM","pathname":"/data-science-and-apache-spark/setup-tensorflow-keras-theano-pytorch-torchvision-on-the-centos-vm","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M7-xjOOgtzL8eLtw59x","title":"Virus Xray Image Classification with Tensorflow Keras Python and Apache Spark Scala","pathname":"/data-science-and-apache-spark/virus-xray-image-classification-with-tensorflow-keras-python-and-apache-spark-scala","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M5KJdkFXeTtSEkTw0fO","title":"Appendix -- Video Presentations","pathname":"/data-science-and-apache-spark/appendix-video-presentations","siteSpaceId":"sitesp_gVJEX","description":""},{"id":"-M4g9l1HYQKf-gMME5n0","title":"References","pathname":"/data-science-and-apache-spark/references","siteSpaceId":"sitesp_gVJEX","description":""}]}