![](/img/trans.png)
[英]AWS Lambda Python function: import module error: module not found
[英]no module found when calling lambda function
我正在尝试在PySpark 1.6上运行python程序。 下面的脚本使用名为“ dateutil”的模块将时间从一个时区转换为另一个时区。 我已经检查了dateutil模块是否已安装在所有工作程序节点上以及用于提交作业的当前系统上。
执行命令:
spark-submit --packages "com.databricks:spark-csv_2.11:1.5.0" test.py
脚本:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import HiveContext, Row, functions, SQLContext
from pyspark.sql.window import Window
import os, sys
import logging
import subprocess
import math
import csv
import functools
import pickle
from operator import add
from itertools import chain
import argparse
import hashlib
import cStringIO
import time
import copy
import datetime
import pytz
conf = SparkConf ()
conf.set('spark.kyroserializer.buffer.max', '32000')
conf.set('spark.scheduler.mode', 'FAIR')
sc = SparkContext(conf = conf, appName = "Testing dateutil...")
sqlContext = HiveContext (sc)
def utcToAESTDateString (row):
#import pytz
from dateutil import tz
utc_tz = dateutil.tz.gettz('UTC')
#utc_tz = pytz.timezone('UTC')
utc_time = datetime.datetime.fromtimestamp(int(row["start time (unix time)"].decode())).replace(tzinfo=utc_time)
#print(utc_time.strftime('%Y-%m-%d %H:%M:%S'))
aest_time = dateutil.tz.gettz('AEST')
math.acos (1)
#print(utc_time.astimezone(aest_time).strftime('%Y-%m-%d %H:%M:%S'))
#aedt_time = tz.gettz('AEDT')
#print(utc_time.astimezone(aedt_time).strftime('%Y-%m-%d %H:%M:%S'))
#return utc_time.astimezone(aedt_time).strftime('%Y-%m-%d')
return Row(sdate = unicode(utc_time.astimezone(aest_time).strftime('%Y-%m-%d'), "utf-8")) + row
sqlContext.createDataFrame(sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='false', quote='"')\
.load("/user/xxx/conviva/*")\
.filter("`start time (unix time)` <> '' AND `start time (unix time)` IS NOT NULL")\
.rdd\
.map(lambda y: utcToAESTDateString(y)))\
.registerTempTable("table1")
#sqlContext.sql ("""select * from table1 left join fixed_dart on table1.`_1` = fixed_dart.`_4` and table1.`_18` = fixed_dart.`_1`""").show()
sqlContext.sql ("""select * from table1 limit 10""").show()
错误:
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
process()
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/opt/cloudera/parcels/CDH-5.8.2-1.cdh5.8.2.p0.3/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1293, in takeUpToNumLeft
File "/home/xxxx/test.py", line 50, in <lambda>
File "/home/xxxx/test.py", line 34, in utcToAESTDateString
NameError: global name 'dateutil' is not defined
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
... 1 more
更改这些行utc_tz = tz.gettz('UTC')
和aest_time = tz.gettz('AEST')
因此,当您导入这样的特定方法时: from dateutil import tz
,您将无法执行类似dateutil.tz
的函数调用, dateutil.tz
必须执行tz()
。
您的代码应如下所示:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import HiveContext, Row, functions, SQLContext
from pyspark.sql.window import Window
import os, sys
import logging
import subprocess
import math
import csv
import functools
import pickle
from operator import add
from itertools import chain
import argparse
import hashlib
import cStringIO
import time
import copy
import datetime
import pytz
conf = SparkConf ()
conf.set('spark.kyroserializer.buffer.max', '32000')
conf.set('spark.scheduler.mode', 'FAIR')
sc = SparkContext(conf = conf, appName = "Testing dateutil...")
sqlContext = HiveContext (sc)
def utcToAESTDateString (row):
#import pytz
from dateutil import tz
utc_tz = tz.gettz('UTC')
#utc_tz = pytz.timezone('UTC')
utc_time = datetime.datetime.fromtimestamp(int(row["start time (unix time)"].decode())).replace(tzinfo=utc_time)
#print(utc_time.strftime('%Y-%m-%d %H:%M:%S'))
aest_time = tz.gettz('AEST')
math.acos (1)
#print(utc_time.astimezone(aest_time).strftime('%Y-%m-%d %H:%M:%S'))
#aedt_time = tz.gettz('AEDT')
#print(utc_time.astimezone(aedt_time).strftime('%Y-%m-%d %H:%M:%S'))
#return utc_time.astimezone(aedt_time).strftime('%Y-%m-%d')
return Row(sdate = unicode(utc_time.astimezone(aest_time).strftime('%Y-%m-%d'), "utf-8")) + row
sqlContext.createDataFrame(sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='false', quote='"')\
.load("/user/xxx/conviva/*")\
.filter("`start time (unix time)` <> '' AND `start time (unix time)` IS NOT NULL")\
.rdd\
.map(lambda y: utcToAESTDateString(y)))\
.registerTempTable("table1")
#sqlContext.sql ("""select * from table1 left join fixed_dart on table1.`_1` = fixed_dart.`_4` and table1.`_18` = fixed_dart.`_1`""").show()
sqlContext.sql ("""select * from table1 limit 10""").show()
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.