
Error when running the model fit function

Brentbin opened this issue 6 years ago · 1 comment

I ran the demo from this page:

import com.tencent.angel.sona.core.DriverContext
import org.apache.spark.angel.ml.classification.AngelClassifier
import org.apache.spark.angel.ml.feature.LabeledPoint
import org.apache.spark.angel.ml.linalg.Vectors
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrameReader, SparkSession}

val spark = SparkSession.builder()
  .master("yarn-cluster")
  .appName("AngelClassification")
  .getOrCreate()

val sparkConf = spark.sparkContext.getConf
val driverCtx = DriverContext.get(sparkConf)

// start the Angel parameter servers and the PS agent on the driver
driverCtx.startAngelAndPSAgent()

// readers for SONA's extended libsvm and dummy data source formats
val libsvm = spark.read.format("libsvmex")
val dummy = spark.read.format("dummy")

val trainData = libsvm.load("./data/angel/a9a/a9a_123d_train.libsvm")

// logistic regression classifier configured from an Angel model JSON
val classifier = new AngelClassifier()
  .setModelJsonFile("./angelml/src/test/jsons/logreg.json")
  .setNumClass(2)
  .setNumBatch(10)
  .setMaxIter(2)
  .setLearningRate(0.1)

val model = classifier.fit(trainData)

model.write.overwrite().save("trained_models/lr")

When I run this line:

scala> val model = classifier.fit(trainData)
19/09/03 13:35:44 WARN UDTRegistration: Cannot register UDT for org.apache.spark.angel.ml.linalg.Vector, which is already registered.
19/09/03 13:35:44 WARN UDTRegistration: Cannot register UDT for org.apache.spark.angel.ml.linalg.DenseVector, which is already registered.
19/09/03 13:35:44 WARN UDTRegistration: Cannot register UDT for org.apache.spark.angel.ml.linalg.SparseVector, which is already registered.
19/09/03 13:35:44 WARN UDTRegistration: Cannot register UDT for org.apache.spark.angel.ml.linalg.Matrix, which is already registered.
19/09/03 13:35:44 WARN UDTRegistration: Cannot register UDT for org.apache.spark.angel.ml.linalg.DenseMatrix, which is already registered.
19/09/03 13:35:44 WARN UDTRegistration: Cannot register UDT for org.apache.spark.angel.ml.linalg.SparseMatrix, which is already registered.
19/09/03 13:35:45 ERROR Executor: Exception in task 0.0 in stage 12.0 (TID 12)
java.lang.Exception: Pls. startAngel first!
	at com.tencent.angel.sona.core.ExecutorContext.sparkWorkerContext$lzycompute(ExecutorContext.scala:32)
	at com.tencent.angel.sona.core.ExecutorContext.sparkWorkerContext(ExecutorContext.scala:30)
	at com.tencent.angel.sona.core.ExecutorContext$.checkGraphModelPool(ExecutorContext.scala:65)
	at com.tencent.angel.sona.core.ExecutorContext$.toGraphModelPool(ExecutorContext.scala:78)
	at org.apache.spark.angel.ml.common.Trainer.trainOneBatch(Trainer.scala:43)
	at org.apache.spark.angel.ml.classification.AngelClassifier$$anonfun$train$1$$anonfun$apply$mcVI$sp$1$$anonfun$8.apply(AngelClassifier.scala:245)
	at org.apache.spark.angel.ml.classification.AngelClassifier$$anonfun$train$1$$anonfun$apply$mcVI$sp$1$$anonfun$8.apply(AngelClassifier.scala:245)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
	at scala.collection.TraversableOnce$class.reduceLeft(TraversableOnce.scala:185)
	at scala.collection.AbstractIterator.reduceLeft(Iterator.scala:1336)
	at org.apache.spark.rdd.RDD$$anonfun$reduce$1$$anonfun$14.apply(RDD.scala:1015)
	at org.apache.spark.rdd.RDD$$anonfun$reduce$1$$anonfun$14.apply(RDD.scala:1013)
	at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:2123)
	at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:2123)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
19/09/03 13:35:45 WARN TaskSetManager: Lost task 0.0 in stage 12.0 (TID 12, localhost, executor driver): java.lang.Exception: Pls. startAngel first!
	... (same stack trace as above)

19/09/03 13:35:45 ERROR TaskSetManager: Task 0 in stage 12.0 failed 1 times; aborting job

My Spark version is 2.3.0, and I ran this code in spark-shell, launched as follows:

spark-shell \
  --conf spark.ps.jars=$SONA_ANGEL_JARS \
  --conf spark.ps.instances=10 \
  --conf spark.ps.cores=2 \
  --conf spark.ps.memory=6g \
  --jars $SONA_SPARK_JARS \
  --name "demo1" \
  --driver-memory 10g \
  --num-executors 10 \
  --executor-cores 2 \
  --executor-memory 4g
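
One thing worth checking with this launch command: it relies on SONA_ANGEL_JARS and SONA_SPARK_JARS already being exported in the shell. If either is empty, the Angel jars are silently never shipped to the parameter servers or executors. A quick guard before launching (just a sketch; the variable names come from the command above):

# fail fast if the SONA jar lists were never set
: "${SONA_ANGEL_JARS:?not set - source the SONA environment script first}"
: "${SONA_SPARK_JARS:?not set - source the SONA environment script first}"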

Brentbin · Sep 03 '19, 06:09

The problem was fixed by exporting SPARK_HOME correctly.
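
For anyone who hits the same "Pls. startAngel first!" error: the fix reported here is just making sure SPARK_HOME points at the right Spark installation before the SONA jar lists are built. A minimal sketch of the environment setup (the concrete paths are placeholders, and the spark-on-angel-env.sh name is taken from the Angel binary distribution's setup docs):

# export SPARK_HOME before sourcing the SONA environment script,
# so the jar lists are built against the Spark that will actually run
export SPARK_HOME=/opt/spark-2.3.0-bin-hadoop2.7
export ANGEL_HOME=/opt/angel-bin

# exports SONA_ANGEL_JARS and SONA_SPARK_JARS used by the spark-shell command
source $ANGEL_HOME/bin/spark-on-angel-env.sh

Then launch spark-shell with the command above, and the executors should find a running Angel instance when fit is called.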

Brentbin · Sep 06 '19, 06:09