I want to log to the standard logger inside an executor during a transformation, with log levels and formatting respected. Unfortunately, I can't get access to the log4j logger object from within the executor.
After a few hours of digging through the Spark repository, it seems this is currently impossible to achieve. The Python worker running on the executor isn't attached to a JVM instance; the data is just streamed over a socket, with no native JVM binding to call into.
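To make the constraint concrete: on the driver, the Py4J gateway does expose log4j, but there is no equivalent handle inside a transformation. A quick illustration (assuming an active SparkContext sc and an RDD rdd):

# Driver side: the Py4J gateway reaches log4j directly
log4j = sc._jvm.org.apache.log4j
logger = log4j.LogManager.getLogger("my.custom.Logger")
logger.info("logged through log4j, but only on the driver")

# Executor side: no gateway exists to call
def process(x):
    # Referencing sc (or anything Py4J-backed) here fails when the
    # closure is pickled and shipped to the worker process
    return x

rdd.map(process)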
Here's the worker creation code, which redirects the worker's output streams to the executor's stderr:
private def createSimpleWorker(): Socket = {
  ...
  val worker = pb.start()

  // Redirect worker stdout and stderr
  redirectStreamsToStderr(worker.getInputStream, worker.getErrorStream)
  ...
}

/**
 * Redirect the given streams to our stderr in separate threads.
 */
private def redirectStreamsToStderr(stdout: InputStream, stderr: InputStream) {
  try {
    new RedirectThread(stdout, System.err, "stdout reader for " + pythonExec).start()
    new RedirectThread(stderr, System.err, "stderr reader for " + pythonExec).start()
  } catch {
    case e: Exception =>
      logError("Exception in redirecting streams", e)
  }
}
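One consequence of this redirect is the closest available workaround: anything the worker writes to stdout or stderr ends up in the executor's stderr log. A minimal sketch using the standard logging module (the helper name and format string are mine; log levels and formatting are only mimicked here, not routed through log4j):

import logging
import sys

def get_worker_logger():
    # Configure the standard logging module to write to stderr, which
    # the RedirectThread above forwards into the executor's stderr log
    logger = logging.getLogger("pyspark-worker")
    if not logger.handlers:
        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger

def transform(split_index, iterator):
    log = get_worker_logger()
    log.info("processing partition %d", split_index)
    for row in iterator:
        yield row

rdd.mapPartitionsWithIndex(transform)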
And here's the worker.py code that handles job processing. There's no place to emit log messages, and no message type that indicates a log event:
try:
    ...
    command = pickleSer._read_with_length(infile)
    if isinstance(command, Broadcast):
        command = pickleSer.loads(command.value)
    func, profiler, deserializer, serializer = command
    init_time = time.time()

    def process():
        iterator = deserializer.load_stream(infile)
        serializer.dump_stream(func(split_index, iterator), outfile)

    if profiler:
        profiler.profile(process)
    else:
        process()
except Exception:
    try:
        write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
        write_with_length(traceback.format_exc().encode("utf-8"), outfile)
    except IOError:
        # JVM close the socket
        pass
    except Exception:
        # Write the error to stderr if it happened while serializing
        print("PySpark worker failed with exception:", file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
    exit(-1)
finish_time = time.time()
report_times(outfile, boot_time, init_time, finish_time)
write_long(shuffle.MemoryBytesSpilled, outfile)
write_long(shuffle.DiskBytesSpilled, outfile)

# Mark the beginning of the accumulators section of the output
write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
write_int(len(_accumulatorRegistry), outfile)
for (aid, accum) in _accumulatorRegistry.items():
    pickleSer._write_with_length((aid, accum._value), outfile)
...
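For reference, the write_* helpers above frame everything as a signed 4-byte big-endian length prefix, with the negative values doubling as control markers. A sketch of what they boil down to (mirroring pyspark.serializers):

import struct

def write_int(value, stream):
    # 4-byte big-endian signed int: non-negative values are payload
    # lengths, negative values are SpecialLengths control markers
    stream.write(struct.pack("!i", value))

def write_with_length(payload, stream):
    write_int(len(payload), stream)
    stream.write(payload)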
And finally, here are the available message types:
class SpecialLengths(object):
    END_OF_DATA_SECTION = -1
    PYTHON_EXCEPTION_THROWN = -2
    TIMING_DATA = -3
    END_OF_STREAM = -4
    NULL = -5
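So supporting real log events would mean adding a new marker and teaching both ends of the protocol about it. A purely hypothetical sketch of the worker side (LOG_MESSAGE and emit_log do not exist in Spark, and the JVM side would need matching changes to route the payload into log4j):

LOG_MESSAGE = -6  # hypothetical marker, not part of SpecialLengths

def emit_log(level, message, outfile):
    # Frame a log event so a (likewise hypothetical) handler in the
    # JVM reader loop could pass level and message on to log4j
    write_int(LOG_MESSAGE, outfile)
    write_with_length(("%s:%s" % (level, message)).encode("utf-8"), outfile)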