I am trying to do a relatively simple import of the module phonenumbers in Python.
I have tested the module in a separate Python file without any other imports, and it works completely fine.
These are the packages I have installed:
from __future__ import absolute_import
from __future__ import print_function
import argparse
import csv
import logging
import os
import phonenumbers
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
And this is my error message:
Traceback (most recent call last):
File "clean.py", line 114, in <module>
run()
File "clean.py", line 109, in run
| 'WriteOutputText' >> beam.io.WriteToText(known_args.output))
File "C:\Python27\lib\site-packages\apache_beam\pipeline.py", line 389, in __exit__
self.run().wait_until_finish()
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\dataflow_runner.py", line 996, in wait_until_finish
(self.state, getattr(self._runner, 'last_error_msg', None)), self)
apache_beam.runners.dataflow.dataflow_runner.DataflowRuntimeException: Dataflow pipeline failed. State: FAILED, Error:
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py", line 733, in run
self._load_main_session(self.local_staging_directory)
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py", line 472, in _load_main_session
pickler.load_session(session_file)
File "/usr/local/lib/python2.7/dist-packages/apache_beam/internal/pickler.py", line 247, in load_session
return dill.load_session(file_path)
File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 363, in load_session
module = unpickler.load()
File "/usr/lib/python2.7/pickle.py", line 864, in load
dispatch[key](self)
File "/usr/lib/python2.7/pickle.py", line 1139, in load_reduce
value = func(*args)
File "/usr/local/lib/python2.7/dist-packages/dill/dill.py", line 766, in _import_module
return __import__(import_name)
ImportError: No module named phonenumbers
Any help would be greatly appreciated, thanks!
EDIT : I have already installed phonenumbers with pip
$ pip install phonenumbers
Requirement already satisfied: phonenumbers in c:\python27\lib\site-packages (8.9.7)
gax-google-logging-v2 0.8.3 has requirement google-gax<0.13.0,>=0.12.5, but you'll have google-gax 0.15.16 which is incompatible.
gcloud 0.18.3 has requirement google-gax<0.13dev,>=0.12.3, but you'll have google-gax 0.15.16 which is incompatible.
google-cloud-vision 0.29.0 has requirement requests<3.0dev,>=2.18.4, but you'll have requests 2.18.2 which is incompatible.
gax-google-pubsub-v1 0.8.3 has requirement google-gax<0.13.0,>=0.12.5, but you'll have google-gax 0.15.16 which is incompatible.
google-cloud-spanner 0.29.0 has requirement requests<3.0dev,>=2.18.4, but you'll have requests 2.18.2 which is incompatible.
This is the pip freeze output
$ pip freeze
adal==1.0.1
apache-beam==2.4.0
asn1crypto==0.22.0
avro==1.8.2
azure==3.0.0
azure-batch==4.1.3
azure-common==1.1.12
azure-cosmosdb-nspkg==2.0.2
azure-cosmosdb-table==1.0.3
azure-datalake-store==0.0.22
azure-eventgrid==0.1.0
azure-graphrbac==0.40.0
azure-keyvault==0.3.7
azure-mgmt==2.0.0
azure-mgmt-advisor==1.0.1
azure-mgmt-applicationinsights==0.1.1
azure-mgmt-authorization==0.30.0
azure-mgmt-batch==5.0.1
azure-mgmt-batchai==0.2.0
azure-mgmt-billing==0.1.0
azure-mgmt-cdn==2.0.0
azure-mgmt-cognitiveservices==2.0.0
azure-mgmt-commerce==1.0.1
azure-mgmt-compute==3.0.1
azure-mgmt-consumption==2.0.0
azure-mgmt-containerinstance==0.3.1
azure-mgmt-containerregistry==1.0.1
azure-mgmt-containerservice==3.0.1
azure-mgmt-cosmosdb==0.3.1
azure-mgmt-datafactory==0.4.0
azure-mgmt-datalake-analytics==0.3.0
azure-mgmt-datalake-nspkg==2.0.0
azure-mgmt-datalake-store==0.3.0
azure-mgmt-devtestlabs==2.2.0
azure-mgmt-dns==1.2.0
azure-mgmt-eventgrid==0.4.0
azure-mgmt-eventhub==1.2.0
azure-mgmt-hanaonazure==0.1.1
azure-mgmt-iothub==0.4.0
azure-mgmt-iothubprovisioningservices==0.1.0
azure-mgmt-keyvault==0.40.0
azure-mgmt-loganalytics==0.1.0
azure-mgmt-logic==2.1.0
azure-mgmt-machinelearningcompute==0.4.1
azure-mgmt-managementpartner==0.1.0
azure-mgmt-marketplaceordering==0.1.0
azure-mgmt-media==0.2.0
azure-mgmt-monitor==0.4.0
azure-mgmt-msi==0.1.0
azure-mgmt-network==1.7.1
azure-mgmt-notificationhubs==1.0.0
azure-mgmt-nspkg==2.0.0
azure-mgmt-powerbiembedded==1.0.0
azure-mgmt-rdbms==0.1.0
azure-mgmt-recoveryservices==0.2.0
azure-mgmt-recoveryservicesbackup==0.1.1
azure-mgmt-redis==5.0.0
azure-mgmt-relay==0.1.0
azure-mgmt-reservations==0.1.0
azure-mgmt-resource==1.2.2
azure-mgmt-scheduler==1.1.3
azure-mgmt-search==1.0.0
azure-mgmt-servermanager==1.2.0
azure-mgmt-servicebus==0.4.0
azure-mgmt-servicefabric==0.1.0
azure-mgmt-sql==0.8.6
azure-mgmt-storage==1.5.0
azure-mgmt-subscription==0.1.0
azure-mgmt-trafficmanager==0.40.0
azure-mgmt-web==0.34.1
azure-nspkg==2.0.0
azure-servicebus==0.21.1
azure-servicefabric==6.1.2.9
azure-servicemanagement-legacy==0.20.6
azure-storage-blob==1.1.0
azure-storage-common==1.1.0
azure-storage-file==1.1.0
azure-storage-nspkg==3.0.0
azure-storage-queue==1.1.0
CacheControl==0.12.5
cachetools==2.1.0
certifi==2017.7.27.1
cffi==1.10.0
chardet==3.0.4
click==6.7
configparser==3.5.0
crcmod==1.7
cryptography==2.0.3
deprecation==2.0.3
dill==0.2.6
docopt==0.6.2
entrypoints==0.2.3
enum34==1.1.6
fasteners==0.14.1
firebase-admin==2.11.0
Flask==0.12.2
funcsigs==1.0.2
future==0.16.0
futures==3.2.0
gapic-google-cloud-datastore-v1==0.15.3
gapic-google-cloud-error-reporting-v1beta1==0.15.3
gapic-google-cloud-logging-v2==0.91.3
gapic-google-cloud-pubsub-v1==0.15.4
gax-google-logging-v2==0.8.3
gax-google-pubsub-v1==0.8.3
gcloud==0.18.3
google-api-core==0.1.4
google-apitools==0.5.20
google-auth==1.5.0
google-auth-httplib2==0.0.3
google-auth-oauthlib==0.2.0
google-cloud==0.33.1
google-cloud-bigquery==0.28.0
google-cloud-bigquery-datatransfer==0.1.1
google-cloud-bigtable==0.28.1
google-cloud-container==0.1.1
google-cloud-core==0.28.1
google-cloud-dataflow==2.4.0
google-cloud-datastore==1.4.0
google-cloud-dns==0.28.0
google-cloud-error-reporting==0.28.0
google-cloud-firestore==0.28.0
google-cloud-language==1.0.2
google-cloud-logging==1.4.0
google-cloud-monitoring==0.28.1
google-cloud-pubsub==0.30.1
google-cloud-resource-manager==0.28.1
google-cloud-runtimeconfig==0.28.1
google-cloud-spanner==0.29.0
google-cloud-speech==0.30.0
google-cloud-storage==1.6.0
google-cloud-trace==0.17.0
google-cloud-translate==1.3.1
google-cloud-videointelligence==1.0.1
google-cloud-vision==0.29.0
google-gax==0.15.16
google-resumable-media==0.3.1
googleapis-common-protos==1.5.3
googledatastore==7.0.1
grpc-google-iam-v1==0.11.4
grpc-google-logging-v2==0.8.1
grpc-google-pubsub-v1==0.8.1
grpcio==1.12.0
gunicorn==19.7.1
hdfs==2.1.0
httplib2==0.9.2
idna==2.5
ipaddress==1.0.18
iso8601==0.1.12
isodate==0.6.0
itsdangerous==0.24
Jinja2==2.9.6
jmespath==0.9.3
keyring==12.2.1
keystoneauth1==3.8.0
linecache2==1.0.0
MarkupSafe==1.0
mock==2.0.0
monotonic==1.5
msgpack==0.5.6
msrest==0.5.0
msrestazure==0.4.32
ndg-httpsclient==0.4.2
nelson==0.4.0
oauth2client==3.0.0
oauthlib==2.1.0
os-service-types==1.2.0
packaging==17.1
pathlib2==2.3.2
pbr==4.0.3
phonenumbers==8.9.7
ply==3.8
proto-google-cloud-datastore-v1==0.90.4
proto-google-cloud-error-reporting-v1beta1==0.15.3
proto-google-cloud-logging-v2==0.91.3
proto-google-cloud-pubsub-v1==0.15.4
protobuf==3.5.2.post1
psutil==5.4.6
psycopg2==2.7.3.2
pyasn1==0.4.3
pyasn1-modules==0.2.1
pycparser==2.18
pyjwt==1.5.0
pyOpenSSL==17.2.0
pyparsing==2.2.0
pyreadline==2.1
python-dateutil==2.7.3
pytz==2018.3
PyVCF==0.6.8
pywin32-ctypes==0.1.2
PyYAML==3.12
rackspaceauth==0.2.0
requests==2.18.2
requests-oauthlib==0.8.0
requests-toolbelt==0.8.0
rsa==3.4.2
scandir==1.7
six==1.10.0
SQLAlchemy==1.1.14
stevedore==1.28.0
traceback2==1.4.0
twilio==6.5.0
typing==3.6.4
unittest2==1.1.0
urllib3==1.22
virtualenv==16.0.0
Werkzeug==0.12.2
EDIT: Code ==
from __future__ import absolute_import
from __future__ import print_function
import argparse
import csv
import logging
import os
from collections import OrderedDict
import phonenumbers
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
class ParseCSVFn(beam.DoFn):
    """Parses one raw CSV line into an ordered Python dictionary.

    Yields an OrderedDict with keys phoneNumber, firstName, lastName,
    birthDate (itself an OrderedDict of day/month/year) and voterId.
    Rows that cannot be parsed (too few columns, malformed date) are
    skipped so a single bad record does not fail the whole pipeline.
    """

    def process(self, elem):
        try:
            row = list(csv.reader([elem]))[0]
            # Column 2 is the birth date; assumed MM/DD/YYYY — TODO confirm
            # against the input CSV.
            month, day, year = row[2].split('/')
            # Build from (key, value) pairs: OrderedDict(plain_dict) would
            # inherit the plain dict's arbitrary ordering on Python 2.7,
            # making the "ordered" output non-deterministic.
            birth_dict = OrderedDict([
                ('day', day),
                ('month', month),
                ('year', year),
            ])
            data_dict = OrderedDict([
                ('phoneNumber', row[4]),
                ('firstName', row[0]),
                ('lastName', row[1]),
                ('birthDate', birth_dict),
                ('voterId', row[3]),
            ])
            yield data_dict
        except (IndexError, ValueError):
            # Best-effort: drop malformed rows instead of crashing the job.
            # Narrowed from a bare `except:` which also swallowed unrelated
            # errors such as KeyboardInterrupt and genuine bugs.
            pass
def run(argv=None):
    """Pipeline entry point: parse arguments, configure Dataflow, run the job.

    Args:
        argv: optional list of command-line arguments; defaults to sys.argv.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--input',
                            type=str,
                            dest='input',
                            default='gs://wordcount_project/demo-contacts-small*.csv',
                            help='Input file to process.')
    # The Google Cloud Storage path is required for outputting the results.
    arg_parser.add_argument('--output',
                            dest='output',
                            default='gs://wordcount_project/cleaned.csv',
                            help='Output file to write results to.')
    known_args, pipeline_args = arg_parser.parse_known_args(argv)

    # Options forwarded to the runner: execute on the Cloud Dataflow service
    # under the given project, with GCS paths for staging and temp files.
    dataflow_options = [
        '--runner=DataflowRunner',
        '--project=--------',
        '--staging_location=gs://wordcount_project/staging',
        '--temp_location=gs://wordcount_project/temp',
        '--job_name=cleaning-jobs',
    ]
    pipeline_args.extend(dataflow_options)

    options = PipelineOptions(pipeline_args)
    # Pickle the main session so module-level state is shipped to workers.
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as pipeline:
        raw_lines = pipeline | 'ReadInputText' >> beam.io.ReadFromText(known_args.input)
        parsed = raw_lines | 'ParseDataFn' >> beam.ParDo(ParseCSVFn())
        parsed | 'WriteOutputText' >> beam.io.WriteToText(known_args.output)
if __name__ == '__main__':
    # Surface pipeline progress at INFO level before launching the job.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    run()
I have also tried installing specific versions of the google-gax and requests packages, but it didn't seem to help.
EDIT: new coding error:
File "new_clean.py", line 226, in <module>
run()
File "new_clean.py", line 219, in run
| 'WriteToText' >> beam.io.WriteToText(known_args.output)
File "C:\Python27\lib\site-packages\apache_beam\pipeline.py", line 389, in __exit__
self.run().wait_until_finish()
File "C:\Python27\lib\site-packages\apache_beam\pipeline.py", line 369, in run
self.to_runner_api(), self.runner, self._options).run(False)
File "C:\Python27\lib\site-packages\apache_beam\pipeline.py", line 382, in run
return self.runner.run_pipeline(self)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\dataflow_runner.py", line 324, in run_pipeline
self.dataflow_client.create_job(self.job), self)
File "C:\Python27\lib\site-packages\apache_beam\utils\retry.py", line 180, in wrapper
return fun(*args, **kwargs)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\internal\apiclient.py", line 461, in create_job
self.create_job_description(job)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\internal\apiclient.py", line 491, in create_job_description
job.options, file_copy=self._gcs_file_copy)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\internal\dependency.py", line 328, in stage_job_resources
setup_options.requirements_file, requirements_cache_path)
File "C:\Python27\lib\site-packages\apache_beam\runners\dataflow\internal\dependency.py", line 262, in _populate_requirements_cache
processes.check_call(cmd_args)
File "C:\Python27\lib\site-packages\apache_beam\utils\processes.py", line 44, in check_call
return subprocess.check_call(*args, **kwargs)
File "C:\Python27\lib\subprocess.py", line 186, in check_call
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['C:\\Python27\\python.exe', '-m', 'pip', 'download', '--dest', 'c:\\users\\james\\appdata\\local\\temp\\dataflow-requirements-cache', '-r', 'requirements.txt', '--no-binary', ':all:']' returned non-zero exit status 1
It may be that Dataflow is not receiving the file with the extra dependencies of your pipeline. To install them, you'd do this:
pip freeze > requirements.txt
Then you'll want to edit the requirements.txt
file and leave only the packages that were installed from PyPI and are used in your pipeline.
When you run your pipeline, pass the following command-line option:
--requirements_file requirements.txt
This is documented in Apache Beam's Python Pipeline Dependencies docs.
Hope that helps.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.