-
Notifications
You must be signed in to change notification settings - Fork 995
Added try-catch block in REST batch v2 openBatchSession #7409
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
248c48f
5d6f5dd
175fee0
6504d27
a3449d6
b06a0f4
303f4f8
ca25fb2
4a08894
c4f0db4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -60,57 +60,77 @@ class KyuubiBatchService( | |
| } | ||
|
|
||
| override def start(): Unit = { | ||
| val UNINITIALIZED_BATCH_ID = "UNINITIALIZED_BATCH_ID" | ||
| assert(running.compareAndSet(false, true)) | ||
| val submitTask: Runnable = () => { | ||
| restFrontend.waitForServerStarted() | ||
| while (running.get) { | ||
| metadataManager.pickBatchForSubmitting(kyuubiInstance) match { | ||
| case None => Thread.sleep(1000) | ||
| case Some(metadata) => | ||
| val batchId = metadata.identifier | ||
| info(s"$batchId is picked for submission.") | ||
| val batchSession = sessionManager.createBatchSession( | ||
| metadata.username, | ||
| "anonymous", | ||
| metadata.ipAddress, | ||
| metadata.requestConf, | ||
| metadata.engineType, | ||
| Option(metadata.requestName), | ||
| metadata.resource, | ||
| metadata.className, | ||
| metadata.requestArgs, | ||
| Some(metadata), | ||
| fromRecovery = false) | ||
| sessionManager.openBatchSession(batchSession) | ||
| var submitted = false | ||
| while (!submitted) { // block until batch job submitted | ||
| submitted = metadataManager.getBatchSessionMetadata(batchId) match { | ||
| case Some(metadata) if OperationState.isTerminal(metadata.opState) => | ||
| true | ||
| case Some(metadata) if metadata.opState == OperationState.RUNNING => | ||
| metadata.appState match { | ||
| // app that is not submitted to resource manager | ||
| case None | Some(ApplicationState.NOT_FOUND) => false | ||
| // app that is pending in resource manager while the local startup | ||
| // process is alive. For example, in Spark YARN cluster mode, if set | ||
| // spark.yarn.submit.waitAppCompletion=false, the local spark-submit | ||
| // process exits immediately once Application goes ACCEPTED status, | ||
| // even no resource could be allocated for the AM container. | ||
| case Some(ApplicationState.PENDING) if batchSession.startupProcessAlive => | ||
| false | ||
| // not sure, added for safe | ||
| case Some(ApplicationState.UNKNOWN) => false | ||
| case _ => true | ||
| } | ||
| case Some(_) => | ||
| false | ||
| case None => | ||
| error(s"$batchId does not existed in metastore, assume it is finished") | ||
| true | ||
| var batchId = UNINITIALIZED_BATCH_ID | ||
| try { | ||
| metadataManager.pickBatchForSubmitting(kyuubiInstance) match { | ||
| case None => Thread.sleep(1000) | ||
| case Some(metadata) => | ||
| batchId = metadata.identifier | ||
| info(s"$batchId is picked for submission.") | ||
| val batchSession = sessionManager.createBatchSession( | ||
| metadata.username, | ||
| "anonymous", | ||
| metadata.ipAddress, | ||
| metadata.requestConf, | ||
| metadata.engineType, | ||
| Option(metadata.requestName), | ||
| metadata.resource, | ||
| metadata.className, | ||
| metadata.requestArgs, | ||
| Some(metadata), | ||
| fromRecovery = false) | ||
| sessionManager.openBatchSession(batchSession) | ||
| var submitted = false | ||
| while (!submitted) { // block until batch job submitted | ||
| submitted = metadataManager.getBatchSessionMetadata(batchId) match { | ||
| case Some(metadata) if OperationState.isTerminal(metadata.opState) => | ||
| true | ||
| case Some(metadata) if metadata.opState == OperationState.RUNNING => | ||
| metadata.appState match { | ||
| // app that is not submitted to resource manager | ||
| case None | Some(ApplicationState.NOT_FOUND) => false | ||
| // app that is pending in resource manager while the local startup | ||
| // process is alive. For example, in Spark YARN cluster mode, if set | ||
| // spark.yarn.submit.waitAppCompletion=false, the local spark-submit | ||
| // process exits immediately once Application goes ACCEPTED status, | ||
| // even no resource could be allocated for the AM container. | ||
| case Some(ApplicationState.PENDING) if batchSession.startupProcessAlive => | ||
| false | ||
| // not sure, added for safe | ||
| case Some(ApplicationState.UNKNOWN) => false | ||
| case _ => true | ||
| } | ||
| case Some(_) => | ||
| false | ||
| case None => | ||
| error(s"$batchId does not existed in metastore, assume it is finished") | ||
| true | ||
| } | ||
| if (!submitted) Thread.sleep(1000) | ||
| } | ||
| info(s"$batchId is submitted or finished.") | ||
| } | ||
| } catch { | ||
| // If the batch session failed to open, reinitialize the batch state to ERROR | ||
| // This can be due to a DB error or batch_connection_limits exceeded | ||
| case e: Exception => | ||
| if (batchId == UNINITIALIZED_BATCH_ID) { | ||
| error(s"Error picking batch for submission", e) | ||
| } else { | ||
| error(s"Error opening batch session for $batchId", e) | ||
| try { | ||
| metadataManager.failScheduledBatch(batchId) | ||
| } catch { | ||
| case ex: Exception => | ||
| error(s"Unable to modify metadata for $batchId to ERROR", ex) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
If there are transient network issues and the code reaches here, will the job still be stuck in the PENDING state?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, they would still be stuck in the PENDING state. However, since the thread doesn't die, the submitter will continue to pick up other jobs.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So it requires the admin to reset the state manually? If so, let's say that in the error message.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you, the error message has been modified. |
||
| } | ||
| if (!submitted) Thread.sleep(1000) | ||
| } | ||
| info(s"$batchId is submitted or finished.") | ||
| Thread.sleep(1000) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What's the intention of this sleep?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I can remove the sleep. We mostly observe the issue when there's a server-wide network issue. Slowing down the submit process would reduce the error rate. However, I do see that this is not the only failure.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I guess this is mostly caused by transient network issues, so the sleep makes sense. Let's keep it and add a comment to explain the intention.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you, the comment and sleep have been added. |
||
| } | ||
|
oh0873 marked this conversation as resolved.
|
||
| } | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.