Binary classification: SVMs, logistic regression, decision trees, random forests, gradient-boosted trees, naive Bayes

Multiclass classification: logistic regression, decision trees, random forests, naive Bayes


Regression: linear least squares, decision trees, random forests, gradient-boosted trees, isotonic regression

 

1. Linear models


  classification (SVMs, logistic regression)
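For reference, both methods fit the standard MLlib linear-method objective: minimize a convex, regularized empirical loss over the weight vector $w$,

$$\min_{w \in \mathbb{R}^d} \; \lambda\, R(w) + \frac{1}{n} \sum_{i=1}^{n} L\big(w;\, x_i, y_i\big),$$

where the SVM uses the hinge loss $L(w; x, y) = \max\{0,\, 1 - y\, w^{\mathsf{T}} x\}$, logistic regression uses the logistic loss $L(w; x, y) = \log\big(1 + e^{-y\, w^{\mathsf{T}} x}\big)$, and the regularizer $R(w)$ is either $\tfrac{1}{2}\lVert w \rVert_2^2$ (L2, the default) or $\lVert w \rVert_1$ (L1, chosen via the L1Updater in the SVM code below).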

   

package ML.ClassificationAndRegression;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.SVMModel;
import org.apache.spark.mllib.classification.SVMWithSGD;
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics;
import org.apache.spark.mllib.optimization.L1Updater;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2;

/**
 * Linear SVM binary classification on the LIBSVM sample data.
 *
 * @ClassName: SVMClassifier
 * @author: DingH
 * @since: 2019/4/9 10:28
 */
public class SVMClassifier {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SVM Classifier Example").setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
        JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();

        // Split the initial RDD in two: 60% training data, 40% test data.
        JavaRDD<LabeledPoint> train = data.sample(false, 0.6, 11L);
        train.cache();
        final JavaRDD<LabeledPoint> test = data.subtract(train);

        // Run the training algorithm: 200 iterations of SGD with L1 regularization.
        SVMWithSGD svm = new SVMWithSGD();
        svm.optimizer().setNumIterations(200).setRegParam(0.01).setUpdater(new L1Updater());
        final SVMModel model1 = svm.run(train.rdd());
//        final SVMModel model1 = SVMWithSGD.train(train.rdd(), 100);  // default L2-regularized alternative

        // Clear the default threshold so predict() returns raw scores for the ROC computation.
        model1.clearThreshold();

        JavaRDD<Tuple2<Object, Object>> scoresAndLabels = test.map(new Function<LabeledPoint, Tuple2<Object, Object>>() {
            public Tuple2<Object, Object> call(LabeledPoint p) throws Exception {
                double score = model1.predict(p.features());
                return new Tuple2<Object, Object>(score, p.label());
            }
        });

        BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(scoresAndLabels.rdd());
        double areaUnderROC = metrics.areaUnderROC();
        System.out.println("Area under ROC = " + areaUnderROC);

        // Save the model and load it back.
        model1.save(jsc.sc(), "D:\\IdeaProjects\\SimpleApp\\src\\main\\java\\MLModel");
        SVMModel model = SVMModel.load(jsc.sc(), "D:\\IdeaProjects\\SimpleApp\\src\\main\\java\\MLModel");
    }
}
package ML.ClassificationAndRegression;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.LogisticRegressionModel;
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS;
import org.apache.spark.mllib.evaluation.MulticlassMetrics;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2;

/**
 * Logistic regression (L-BFGS) on the LIBSVM sample data.
 *
 * @ClassName: LogisticRegression
 * @author: DingH
 * @since: 2019/4/9 11:08
 */
public class LogisticRegression {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("LogisticRegression");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
        JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();

        JavaRDD<LabeledPoint>[] split = data.randomSplit(new double[]{0.6, 0.4}, 11L);
        JavaRDD<LabeledPoint> training = split[0].cache();
        final JavaRDD<LabeledPoint> test = split[1];

        // LogisticRegressionWithLBFGS supports multinomial models; setNumClasses(10)
        // allows up to 10 classes even though this sample data is binary.
        final LogisticRegressionModel model = new LogisticRegressionWithLBFGS().setNumClasses(10).run(training.rdd());
        JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(new Function<LabeledPoint, Tuple2<Object, Object>>() {
            public Tuple2<Object, Object> call(LabeledPoint labeledPoint) throws Exception {
                double predict = model.predict(labeledPoint.features());
                return new Tuple2<Object, Object>(predict, labeledPoint.label());
            }
        });

        MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());

        double precision = metrics.precision();
        System.out.println("Precision = " + precision);

        // Save and load model
//        model.save(jsc.sc(), "myModelPath");
//        LogisticRegressionModel sameModel = LogisticRegressionModel.load(jsc.sc(), "myModelPath");
    }
}

 

  linear regression (least squares, Lasso, ridge)

 

package ML.ClassificationAndRegression;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.regression.LinearRegressionModel;
import org.apache.spark.mllib.regression.LinearRegressionWithSGD;
import scala.Tuple2;

/**
 * Linear least squares regression (SGD) on the lpsa data.
 *
 * @ClassName: Regression
 * @author: DingH
 * @since: 2019/4/9 11:21
 */
public class Regression {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Regression").setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\ridge-data\\lpsa.data";
        JavaRDD<String> data = jsc.textFile(path);

        // Each line is "label,feature1 feature2 ...".
        JavaRDD<LabeledPoint> parsedData = data.map(new Function<String, LabeledPoint>() {
            public LabeledPoint call(String line) throws Exception {
                String[] split = line.split(",");
                String[] features = split[1].split(" ");
                double[] v = new double[features.length];
                for (int i = 0; i < features.length; i++) {
                    v[i] = Double.parseDouble(features[i]);
                }

                return new LabeledPoint(Double.parseDouble(split[0]), Vectors.dense(v));
            }
        }).cache();

        // A tiny step size keeps SGD from diverging on this data set.
        int numIterations = 100;
        double stepSize = 0.00000001;
        final LinearRegressionModel model = LinearRegressionWithSGD.train(parsedData.rdd(), numIterations, stepSize);

        JavaRDD<Tuple2<Double, Double>> valuesAndLabels = parsedData.map(new Function<LabeledPoint, Tuple2<Double, Double>>() {
            public Tuple2<Double, Double> call(LabeledPoint labeledPoint) throws Exception {
                double predict = model.predict(labeledPoint.features());
                return new Tuple2<Double, Double>(predict, labeledPoint.label());
            }
        });

        // Mean squared error over the training set.
        Double MSE = new JavaDoubleRDD(valuesAndLabels.map(
                new Function<Tuple2<Double, Double>, Object>() {
                    public Object call(Tuple2<Double, Double> dat) throws Exception {
                        return Math.pow(dat._1 - dat._2, 2.0);
                    }
                }
        ).rdd()).mean();
        System.out.println("training Mean Squared Error = " + MSE);

        // Save and load model
//        model.save(jsc.sc(), "myModelPath");
//        LinearRegressionModel sameModel = LinearRegressionModel.load(jsc.sc(), "myModelPath");
    }
}
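The example above covers only ordinary least squares. A minimal sketch of the Lasso (L1 penalty) and ridge (L2 penalty) variants follows; the step size, regParam, and relative data path are assumed values for illustration, and the LIBSVM classification sample is used only as convenient runnable input:

package ML.ClassificationAndRegression;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.regression.LassoModel;
import org.apache.spark.mllib.regression.LassoWithSGD;
import org.apache.spark.mllib.regression.RidgeRegressionModel;
import org.apache.spark.mllib.regression.RidgeRegressionWithSGD;
import org.apache.spark.mllib.util.MLUtils;

public class RegularizedRegression {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("RegularizedRegression").setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        // Placeholder input so the sketch runs end to end (hypothetical relative path).
        JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();

        int numIterations = 100;
        double stepSize = 0.1;      // assumed; tune for your data
        double regParam = 0.01;     // assumed regularization strength

        // Lasso = least squares + L1 penalty: drives small weights to exactly zero.
        LassoModel lasso = LassoWithSGD.train(data.rdd(), numIterations, stepSize, regParam);

        // Ridge = least squares + L2 penalty: shrinks all weights smoothly.
        RidgeRegressionModel ridge = RidgeRegressionWithSGD.train(data.rdd(), numIterations, stepSize, regParam);

        System.out.println("Lasso intercept: " + lasso.intercept());
        System.out.println("Ridge intercept: " + ridge.intercept());
    }
}

Apart from the penalty, the training interface matches LinearRegressionWithSGD, so the MSE evaluation from the example above applies unchanged.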

 

2. Decision Trees


 

problem specification parameters: algo, numClasses, categoricalFeaturesInfo

stopping criteria: maxDepth, minInfoGain, minInstancesPerNode

tunable parameters: maxBins, impurity
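DecisionTree.trainClassifier exposes only part of this list; minInfoGain and minInstancesPerNode are configured through the Strategy API instead. A minimal sketch, assuming the RDD-based API and a hypothetical relative data path:

package ML.DT;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.DecisionTree;
import org.apache.spark.mllib.tree.configuration.Strategy;
import org.apache.spark.mllib.tree.model.DecisionTreeModel;
import org.apache.spark.mllib.util.MLUtils;

public class StoppingCriteriaExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("DTStoppingCriteria");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();

        // defaultStrategy("Classification") starts from Gini impurity; the setters
        // below adjust the stopping criteria named above.
        Strategy strategy = Strategy.defaultStrategy("Classification");
        strategy.setNumClasses(2);
        strategy.setMaxDepth(5);              // hard depth limit
        strategy.setMinInstancesPerNode(10);  // do not split nodes with fewer than 10 instances
        strategy.setMinInfoGain(0.001);       // require a minimum information gain per split
        strategy.setMaxBins(32);

        DecisionTreeModel model = DecisionTree.train(data.rdd(), strategy);
        System.out.println(model.toDebugString());
    }
}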

package ML.DT;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.DecisionTree;
import org.apache.spark.mllib.tree.model.DecisionTreeModel;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2;

import java.util.HashMap;

/**
 * Decision tree classification on the LIBSVM sample data.
 *
 * @ClassName: classification
 * @author: DingH
 * @since: 2019/4/9 16:11
 */
public class classification {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("DTclassification").setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
        JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();

        JavaRDD<LabeledPoint>[] split = data.randomSplit(new double[]{0.7, 0.3}, 11L);
        JavaRDD<LabeledPoint> trainingData = split[0];
        JavaRDD<LabeledPoint> test = split[1];

        // Empty categoricalFeaturesInfo indicates all features are continuous.
        int numClasses = 2;
        HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
        String impurity = "gini";
        int maxDepth = 1;   // a depth-1 stump; use a larger depth in practice
        int maxBins = 32;

        final DecisionTreeModel model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins);
        JavaPairRDD<Double, Double> predictionAndLabel = test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
            public Tuple2<Double, Double> call(LabeledPoint labeledPoint) throws Exception {
                return new Tuple2<Double, Double>(model.predict(labeledPoint.features()), labeledPoint.label());
            }
        });

        // Fraction of test points whose predicted label differs from the true label.
        double testErr = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
            public Boolean call(Tuple2<Double, Double> pl) throws Exception {
                return !pl._1().equals(pl._2());
            }
        }).count() * 1.0 / test.count();

        System.out.println("Test Error: " + testErr);
        System.out.println("Learned classification tree model:\n" + model.toDebugString());

        // Save and load model
//        model.save(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel");
//        DecisionTreeModel sameModel = DecisionTreeModel.load(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel");


    }
}
package ML.DT;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.DecisionTree;
import org.apache.spark.mllib.tree.model.DecisionTreeModel;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2;

import java.util.HashMap;
import java.util.Map;

/**
 * Decision tree regression on the LIBSVM sample data.
 *
 * @ClassName: Regression
 * @author: DingH
 * @since: 2019/4/9 16:33
 */
public class Regression {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTreeRegressionExample").setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);

        // Load and parse the data file.
        String datapath = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
        JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
        // Split the data into training and test sets (30% held out for testing)
        JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
        JavaRDD<LabeledPoint> trainingData = splits[0];
        JavaRDD<LabeledPoint> testData = splits[1];

        // Set parameters.
        // Empty categoricalFeaturesInfo indicates all features are continuous.
        Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
        String impurity = "variance";
        Integer maxDepth = 5;
        Integer maxBins = 32;

        // Train a DecisionTree model.
        final DecisionTreeModel model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo, impurity, maxDepth, maxBins);

        // Evaluate model on test instances and compute test error
        JavaPairRDD<Double, Double> predictionAndLabel = testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
            public Tuple2<Double, Double> call(LabeledPoint p) {
                return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
            }
        });

        Double testMSE = predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
            public Double call(Tuple2<Double, Double> pl) {
                Double diff = pl._1() - pl._2();
                return diff * diff;
            }
        }).reduce(new Function2<Double, Double, Double>() {
            public Double call(Double a, Double b) {
                return a + b;
            }
        }) / testData.count();

        System.out.println("Test Mean Squared Error: " + testMSE);
        System.out.println("Learned regression tree model:\n" + model.toDebugString());

        // Save and load model
//        model.save(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel");
//        DecisionTreeModel sameModel = DecisionTreeModel.load(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel");
    }
}

 

3. Random Forests

  Randomness in both the samples and the features: each tree is trained on a bootstrap sample of the data, and each split considers only a random subset of the features.

  featureSubsetStrategy - Number of features to consider for splits at each node. Supported: "auto", "all", "sqrt", "log2", "onethird". If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "sqrt".

package ML.RF;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.RandomForest;
import org.apache.spark.mllib.tree.model.RandomForestModel;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2;

import java.util.HashMap;

/**
 * Random forest classification on the LIBSVM sample data.
 *
 * @ClassName: classification
 * @author: DingH
 * @since: 2019/4/9 16:58
 */
public class classification {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("JavaRandomForestClassificationExample");
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);

        // Load and parse the data file.
        String datapath = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
        JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();

        // Split the data into training and test sets (30% held out for testing)
        JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
        JavaRDD<LabeledPoint> trainingData = splits[0];
        JavaRDD<LabeledPoint> testData = splits[1];

        // Train a RandomForest model.
        // Empty categoricalFeaturesInfo indicates all features are continuous.
        Integer numClasses = 2;
        HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
        Integer numTrees = 3; // Use more in practice.
        String featureSubsetStrategy = "auto"; // Let the algorithm choose.
        String impurity = "gini";
        Integer maxDepth = 5;
        Integer maxBins = 32;
        Integer seed = 12345;

        final RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses,categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed);

        // Evaluate model on test instances and compute test error
        JavaPairRDD<Double, Double> predictionAndLabel = testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
            public Tuple2<Double, Double> call(LabeledPoint p) {
              return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
            }
          });

        Double testErr =
          1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
            public Boolean call(Tuple2<Double, Double> pl) {
              return !pl._1().equals(pl._2());
            }
          }).count() / testData.count();

        System.out.println("Test Error: " + testErr);
        System.out.println("Learned classification forest model:\n" + model.toDebugString());

        // Save and load model
//        model.save(jsc.sc(), "target/tmp/myRandomForestClassificationModel");
//        RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),"target/tmp/myRandomForestClassificationModel");
    }

}
package ML.RF;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.RandomForest;
import org.apache.spark.mllib.tree.model.RandomForestModel;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2;

import java.util.HashMap;
import java.util.Map;

/**
 * Random forest regression on the LIBSVM sample data.
 *
 * @ClassName: regression
 * @author: DingH
 * @since: 2019/4/9 17:50
 */
public class regression {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("JavaRandomForestRegressionExample");
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
        // Load and parse the data file.
        String datapath = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
        JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();

        // Split the data into training and test sets (30% held out for testing)
        JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
        JavaRDD<LabeledPoint> trainingData = splits[0];
        JavaRDD<LabeledPoint> testData = splits[1];

        // Set parameters.
        // Empty categoricalFeaturesInfo indicates all features are continuous.
        Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
        Integer numTrees = 3; // Use more in practice.
        String featureSubsetStrategy = "auto"; // Let the algorithm choose.
        String impurity = "variance";
        Integer maxDepth = 4;
        Integer maxBins = 32;
        Integer seed = 12345;
        // Train a RandomForest model.
        final RandomForestModel model = RandomForest.trainRegressor(trainingData,categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed);

        // Evaluate model on test instances and compute test error
        JavaPairRDD<Double, Double> predictionAndLabel = testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
            public Tuple2<Double, Double> call(LabeledPoint p) {
              return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
            }
          });
        Double testMSE = predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
            public Double call(Tuple2<Double, Double> pl) {
              Double diff = pl._1() - pl._2();
              return diff * diff;
            }
          }).reduce(new Function2<Double, Double, Double>() {
            public Double call(Double a, Double b) {
              return a + b;
            }
          }) / testData.count();

        System.out.println("Test Mean Squared Error: " + testMSE);
        System.out.println("Learned regression forest model:\n" + model.toDebugString());

        // Save and load model
        model.save(jsc.sc(), "target/tmp/myRandomForestRegressionModel");
        RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),
          "target/tmp/myRandomForestRegressionModel");
    }
}

 

4. Gradient-Boosted Trees


 

Usage tips: loss, numIterations, learningRate, algo

BoostingStrategy.validationTol: the convergence tolerance used by runWithValidation, which stops training once the validation error improves by less than this threshold (see the sketch after the example below).

package ML.GradientBoostedTrees;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.GradientBoostedTrees;
import org.apache.spark.mllib.tree.configuration.BoostingStrategy;
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2;

import java.util.HashMap;
import java.util.Map;

/**
 * Gradient-boosted tree classification on the LIBSVM sample data.
 *
 * @ClassName: classification
 * @author: DingH
 * @since: 2019/4/9 17:56
 */
public class classification {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("JavaGradientBoostedTreesClassificationExample");
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);

        // Load and parse the data file.
        String datapath = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
        JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();

        // Split the data into training and test sets (30% held out for testing)
        JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
        JavaRDD<LabeledPoint> trainingData = splits[0];
        JavaRDD<LabeledPoint> testData = splits[1];

        // Train a GradientBoostedTrees model.
        // The defaultParams for Classification use LogLoss by default.
        BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Classification");
        boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice.
        boostingStrategy.getTreeStrategy().setNumClasses(2);
        boostingStrategy.getTreeStrategy().setMaxDepth(5);
        // Empty categoricalFeaturesInfo indicates all features are continuous.
        Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
        boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo);

        final GradientBoostedTreesModel model =
          GradientBoostedTrees.train(trainingData, boostingStrategy);

        // Evaluate model on test instances and compute test error
        JavaPairRDD<Double, Double> predictionAndLabel =
          testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
            public Tuple2<Double, Double> call(LabeledPoint p) {
              return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
            }
          });
        Double testErr =
          1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
            public Boolean call(Tuple2<Double, Double> pl) {
              return !pl._1().equals(pl._2());
            }
          }).count() / testData.count();
        System.out.println("Test Error: " + testErr);
        System.out.println("Learned classification GBT model:\n" + model.toDebugString());

        // Save and load model
//        model.save(jsc.sc(), "target/tmp/myGradientBoostingClassificationModel");
//        GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(jsc.sc(),
//          "target/tmp/myGradientBoostingClassificationModel");
    }
}
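As mentioned in the usage tips above, validationTol only takes effect with runWithValidation. A minimal sketch of validation-based early stopping; the iteration cap, tolerance, and relative data path are assumed values:

package ML.GradientBoostedTrees;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.GradientBoostedTrees;
import org.apache.spark.mllib.tree.configuration.BoostingStrategy;
import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel;
import org.apache.spark.mllib.util.MLUtils;

public class ValidationExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("GBTValidationExample");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();

        // Hold out part of the data as a validation set.
        JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});

        BoostingStrategy strategy = BoostingStrategy.defaultParams("Classification");
        strategy.setNumIterations(100);    // upper bound; early stopping may build fewer trees
        strategy.setValidationTol(0.001);  // stop once validation error improves less than this
        strategy.getTreeStrategy().setNumClasses(2);

        // runWithValidation evaluates each new tree on the validation set and
        // stops adding trees when the improvement drops below validationTol.
        GradientBoostedTreesModel model = new GradientBoostedTrees(strategy).runWithValidation(splits[0], splits[1]);

        System.out.println("Number of trees actually built: " + model.numTrees());
    }
}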

 

5. Naive Bayes

  Model types: "multinomial" (the default) and "bernoulli"
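A minimal sketch of the Bernoulli variant. Bernoulli naive Bayes models feature presence/absence, so every feature value must be 0 or 1; the input path below is hypothetical because the sample_libsvm_data.txt used in the example that follows is not binary:

package ML.ClassificationAndRegression.NaiveBayes;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.classification.NaiveBayes;
import org.apache.spark.mllib.classification.NaiveBayesModel;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;

public class BernoulliExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("bernoulliNBExample");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // Hypothetical file whose features have already been binarized to 0/1.
        JavaRDD<LabeledPoint> binaryData = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_binary_data.txt").toJavaRDD();

        // Same additive (Laplace) smoothing lambda = 1.0 as the multinomial example below.
        NaiveBayesModel model = NaiveBayes.train(binaryData.rdd(), 1.0, "bernoulli");
        System.out.println("Class prior of the first label: " + model.pi()[0]);
    }
}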

  

package ML.ClassificationAndRegression.NaiveBayes;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.NaiveBayes;
import org.apache.spark.mllib.classification.NaiveBayesModel;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;
import scala.Tuple2;

/**
 * Multinomial naive Bayes on the LIBSVM sample data.
 *
 * @ClassName: example
 * @author: DingH
 * @since: 2019/4/10 10:04
 */
public class example {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("naiveBayesExample");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_libsvm_data.txt";
        JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();

        JavaRDD<LabeledPoint>[] split = data.randomSplit(new double[]{0.6, 0.4}, 12345L);
        JavaRDD<LabeledPoint> train = split[0];
        JavaRDD<LabeledPoint> test = split[1];

        // lambda = 1.0 is additive (Laplace) smoothing.
        final NaiveBayesModel model = NaiveBayes.train(train.rdd(), 1.0, "multinomial");

        JavaRDD<Tuple2<Double, Double>> predictionAndLabel = test.map(new Function<LabeledPoint, Tuple2<Double, Double>>() {
            public Tuple2<Double, Double> call(LabeledPoint labeledPoint) throws Exception {
                double predict = model.predict(labeledPoint.features());
                return new Tuple2<Double, Double>(predict, labeledPoint.label());
            }
        });

        // Fraction of test points classified correctly.
        double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
            public Boolean call(Tuple2<Double, Double> pl) throws Exception {
                return pl._1().equals(pl._2());
            }
        }).count() / (double) test.count();

        System.out.println("accuracy is : " + accuracy);

        // Save and load model
//        model.save(jsc.sc(), "myModelPath");
//        NaiveBayesModel sameModel = NaiveBayesModel.load(jsc.sc(), "myModelPath");
    }
}

 

6. Isotonic Regression
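  Isotonic regression fits a free-form, non-decreasing function to the data (MLlib implements a parallelized pool adjacent violators algorithm); each training record is a (label, feature, weight) triple.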


  

package ML.ClassificationAndRegression.IsotonicRegression;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.regression.IsotonicRegression;
import org.apache.spark.mllib.regression.IsotonicRegressionModel;
import scala.Tuple2;
import scala.Tuple3;

/**
 * Isotonic regression on the sample isotonic data.
 *
 * @ClassName: example
 * @author: DingH
 * @since: 2019/4/10 10:31
 */
public class example {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("isotonicRegressionExample");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        String path = "D:\\IdeaProjects\\SimpleApp\\src\\main\\resources\\data\\mllib\\sample_isotonic_regression_data.txt";
        JavaRDD<String> data = jsc.textFile(path);

        // Each record is a (label, feature, weight) triple; the weight is fixed to 1.0 here.
        JavaRDD<Tuple3<Double, Double, Double>> parsedData = data.map(new Function<String, Tuple3<Double, Double, Double>>() {
            public Tuple3<Double, Double, Double> call(String s) throws Exception {
                String[] strings = s.split(",");
                return new Tuple3<Double, Double, Double>(Double.parseDouble(strings[0]), Double.parseDouble(strings[1]), 1.0);
            }
        });

        JavaRDD<Tuple3<Double, Double, Double>>[] split = parsedData.randomSplit(new double[]{0.7, 0.3}, 1234L);
        JavaRDD<Tuple3<Double, Double, Double>> train = split[0];
        JavaRDD<Tuple3<Double, Double, Double>> test = split[1];

        // setIsotonic(true) fits a non-decreasing function; false fits a non-increasing one.
        final IsotonicRegressionModel model = new IsotonicRegression().setIsotonic(true).run(train);

        // Predict on the feature (._2()) and pair the prediction with the true label (._1()).
        JavaRDD<Tuple2<Double, Double>> predictionAndLabel = test.map(new Function<Tuple3<Double, Double, Double>, Tuple2<Double, Double>>() {
            public Tuple2<Double, Double> call(Tuple3<Double, Double, Double> point) throws Exception {
                double predict = model.predict(point._2());
                return new Tuple2<Double, Double>(predict, point._1());
            }
        });

        Double MSE = new JavaDoubleRDD(predictionAndLabel.map(new Function<Tuple2<Double, Double>, Object>() {
            public Object call(Tuple2<Double, Double> pl) throws Exception {
                return Math.pow(pl._1() - pl._2(), 2.0);
            }
        }).rdd()).mean();

        System.out.println("Mean Squared Error = " + MSE);
    }
}

 

      

      

    

 
