Error when running the model fit() function
I am running the demo from this page:
import com.tencent.angel.sona.core.DriverContext
import org.apache.spark.angel.ml.classification.AngelClassifier
import org.apache.spark.angel.ml.feature.LabeledPoint
import org.apache.spark.angel.ml.linalg.Vectors
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrameReader, SparkSession}

val spark = SparkSession.builder()
  .master("yarn-cluster")
  .appName("AngelClassification")
  .getOrCreate()

val sparkConf = spark.sparkContext.getConf

// Start Angel and the PS agent before any training call
val driverCtx = DriverContext.get(sparkConf)
driverCtx.startAngelAndPSAgent()

// Readers for the supported input formats
val libsvm = spark.read.format("libsvmex")
val dummy = spark.read.format("dummy")
val trainData = libsvm.load("./data/angel/a9a/a9a_123d_train.libsvm")

val classifier = new AngelClassifier()
  .setModelJsonFile("./angelml/src/test/jsons/logreg.json")
  .setNumClass(2)
  .setNumBatch(10)
  .setMaxIter(2)
  .setLearningRate(0.1)

val model = classifier.fit(trainData)
model.write.overwrite().save("trained_models/lr")
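As an aside, the SONA examples normally also release the parameter servers once training and saving are done; a minimal sketch, assuming the same driverCtx as above:

// Stop Angel and the PS agent when the session is finished
driverCtx.stopAngelAndPSAgent()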
When I run this line:

val model = classifier.fit(trainData)

the shell reports the following:
scala> val model = classifier.fit(trainData)
19/09/03 13:35:44 WARN UDTRegistration: Cannot register UDT for org.apache.spark.angel.ml.linalg.Vector, which is already registered.
19/09/03 13:35:44 WARN UDTRegistration: Cannot register UDT for org.apache.spark.angel.ml.linalg.DenseVector, which is already registered.
19/09/03 13:35:44 WARN UDTRegistration: Cannot register UDT for org.apache.spark.angel.ml.linalg.SparseVector, which is already registered.
19/09/03 13:35:44 WARN UDTRegistration: Cannot register UDT for org.apache.spark.angel.ml.linalg.Matrix, which is already registered.
19/09/03 13:35:44 WARN UDTRegistration: Cannot register UDT for org.apache.spark.angel.ml.linalg.DenseMatrix, which is already registered.
19/09/03 13:35:44 WARN UDTRegistration: Cannot register UDT for org.apache.spark.angel.ml.linalg.SparseMatrix, which is already registered.
19/09/03 13:35:45 ERROR Executor: Exception in task 0.0 in stage 12.0 (TID 12)
java.lang.Exception: Pls. startAngel first!
at com.tencent.angel.sona.core.ExecutorContext.sparkWorkerContext$lzycompute(ExecutorContext.scala:32)
at com.tencent.angel.sona.core.ExecutorContext.sparkWorkerContext(ExecutorContext.scala:30)
at com.tencent.angel.sona.core.ExecutorContext$.checkGraphModelPool(ExecutorContext.scala:65)
at com.tencent.angel.sona.core.ExecutorContext$.toGraphModelPool(ExecutorContext.scala:78)
at org.apache.spark.angel.ml.common.Trainer.trainOneBatch(Trainer.scala:43)
at org.apache.spark.angel.ml.classification.AngelClassifier$$anonfun$train$1$$anonfun$apply$mcVI$sp$1$$anonfun$8.apply(AngelClassifier.scala:245)
at org.apache.spark.angel.ml.classification.AngelClassifier$$anonfun$train$1$$anonfun$apply$mcVI$sp$1$$anonfun$8.apply(AngelClassifier.scala:245)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.reduceLeft(TraversableOnce.scala:185)
at scala.collection.AbstractIterator.reduceLeft(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$reduce$1$$anonfun$14.apply(RDD.scala:1015)
at org.apache.spark.rdd.RDD$$anonfun$reduce$1$$anonfun$14.apply(RDD.scala:1013)
at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:2123)
at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:2123)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
19/09/03 13:35:45 WARN TaskSetManager: Lost task 0.0 in stage 12.0 (TID 12, localhost, executor driver): java.lang.Exception: Pls. startAngel first!
at com.tencent.angel.sona.core.ExecutorContext.sparkWorkerContext$lzycompute(ExecutorContext.scala:32)
at com.tencent.angel.sona.core.ExecutorContext.sparkWorkerContext(ExecutorContext.scala:30)
at com.tencent.angel.sona.core.ExecutorContext$.checkGraphModelPool(ExecutorContext.scala:65)
at com.tencent.angel.sona.core.ExecutorContext$.toGraphModelPool(ExecutorContext.scala:78)
at org.apache.spark.angel.ml.common.Trainer.trainOneBatch(Trainer.scala:43)
at org.apache.spark.angel.ml.classification.AngelClassifier$$anonfun$train$1$$anonfun$apply$mcVI$sp$1$$anonfun$8.apply(AngelClassifier.scala:245)
at org.apache.spark.angel.ml.classification.AngelClassifier$$anonfun$train$1$$anonfun$apply$mcVI$sp$1$$anonfun$8.apply(AngelClassifier.scala:245)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.TraversableOnce$class.reduceLeft(TraversableOnce.scala:185)
at scala.collection.AbstractIterator.reduceLeft(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$reduce$1$$anonfun$14.apply(RDD.scala:1015)
at org.apache.spark.rdd.RDD$$anonfun$reduce$1$$anonfun$14.apply(RDD.scala:1013)
at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:2123)
at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:2123)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
19/09/03 13:35:45 ERROR TaskSetManager: Task 0 in stage 12.0 failed 1 times; aborting job
My Spark version is 2.3.0, and I run this code in spark-shell, launched as follows:
spark-shell \
--conf spark.ps.jars=$SONA_ANGEL_JARS \
--conf spark.ps.instances=10 \
--conf spark.ps.cores=2 \
--conf spark.ps.memory=6g \
--jars $SONA_SPARK_JARS \
--name "demo1" \
--driver-memory 10g \
--num-executors 10 \
--executor-cores 2 \
--executor-memory 4g
Update: the problem was fixed by exporting SPARK_HOME correctly before launching spark-shell.
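For anyone hitting the same error, a minimal sketch of the fix (the install path below is a placeholder, not from the original report):

# Point SPARK_HOME at the Spark 2.3.0 installation that spark-shell actually runs
export SPARK_HOME=/opt/spark-2.3.0
export PATH=$SPARK_HOME/bin:$PATH
# then re-set SONA_ANGEL_JARS / SONA_SPARK_JARS via SONA's environment script
# and launch spark-shell with the flags shown above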