Enable logging in spark tasks
rao-abdul-mannan committed Jun 7, 2018
1 parent b863f58 commit 744684e
Showing 1 changed file with 4 additions and 0 deletions.
edx/analytics/tasks/common/spark.py
@@ -255,9 +255,11 @@ def get_input_source(self, *args):
             # Reading the manifest as an RDD with Spark is a lot faster than reading it with Hadoop.
             # Currently we get only one manifest file per request, so we create a single RDD from it.
             # If there are multiple manifest files, each can be read as an RDD and then unioned with the other manifest RDDs (see the sketch after this hunk).
+            self.log.warn("PYSPARK LOGGER: Reading manifest file :: {} ".format(targets[0].path))
             source_rdd = self._spark.sparkContext.textFile(targets[0].path)
             broadcast_value = self._spark.sparkContext.broadcast(source_rdd.collect())
         else:
+            self.log.warn("PYSPARK LOGGER: Reading normal targets")
             broadcast_value = self._spark.sparkContext.broadcast([target.path for target in targets])
         return broadcast_value
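The comment above sketches the multiple-manifest case. Below is a minimal standalone illustration of that pattern; the manifest paths and the freestanding SparkSession are hypothetical and do not appear in this commit.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Hypothetical manifest files, each listing one input path per line.
manifest_paths = [
    "hdfs:///manifests/part-0001.manifest",
    "hdfs:///manifests/part-0002.manifest",
]

# Read each manifest as its own RDD, then union them into a single RDD.
manifest_rdds = [spark.sparkContext.textFile(path) for path in manifest_paths]
combined = spark.sparkContext.union(manifest_rdds)

# Collect the (small) list of input paths on the driver and broadcast it,
# mirroring what get_input_source does in the single-manifest case above.
broadcast_value = spark.sparkContext.broadcast(combined.collect())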

@@ -298,6 +300,8 @@ def init_spark(self, sc):
         self._spark_context = sc
         self._spark = SparkSession.builder.getOrCreate()
         self._hive_context = HiveContext(sc)
+        log4jLogger = sc._jvm.org.apache.log4j  # reach the JVM's log4j (Spark's own logger) via the py4j gateway
+        self.log = log4jLogger.LogManager.getLogger(__name__)

     @property
     def conf(self):
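For context, here is a minimal, self-contained sketch of the logging setup the two added lines provide: the py4j gateway exposed as sc._jvm reaches the JVM's log4j, so Python code logs through the same logger Spark itself uses. The freestanding SparkContext here is an assumption for illustration.

from pyspark import SparkContext

sc = SparkContext.getOrCreate()

# Reach the JVM's log4j package through the py4j gateway.
log4j = sc._jvm.org.apache.log4j
log = log4j.LogManager.getLogger(__name__)

# Messages land in the driver log alongside Spark's own output.
log.warn("PYSPARK LOGGER: example message")

Note that the resulting logger is a JVM object and cannot be pickled, so this pattern suits driver-side code such as get_input_source above rather than functions shipped to executors.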
