Question
When I run my TensorFlow model I receive this error: InvalidArgumentError: Field 4 in record 0 is not a valid float: latency
[[Node: DecodeCSV = DecodeCSV[OUT_TYPE=[DT_STRING, DT_STRING, DT_STRING, DT_STRING, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_STRING, DT_STRING, DT_STRING, DT_STRING, DT_STRING, DT_STRING, DT_STRING], field_delim=",", na_value="", use_quote_delim=true](arg0, DecodeCSV/record_defaults_0, DecodeCSV/record_defaults_1, DecodeCSV/record_defaults_2, DecodeCSV/record_defaults_3, DecodeCSV/record_defaults_4, DecodeCSV/record_defaults_5, DecodeCSV/record_defaults_6, DecodeCSV/record_defaults_7, DecodeCSV/record_defaults_8, DecodeCSV/record_defaults_9, DecodeCSV/record_defaults_10, DecodeCSV/record_defaults_11, DecodeCSV/record_defaults_12, DecodeCSV/record_defaults_13)]]
[[Node: IteratorGetNext = IteratorGetNext[output_shapes=[[?], [?], [?], [?], [?], [?], [?], [?], [?], [?], [?], [?], [?], [?]], output_types=[DT_STRING, DT_STRING, DT_FLOAT, DT_STRING, DT_STRING, DT_STRING, DT_STRING, DT_STRING, DT_STRING, DT_STRING, DT_FLOAT, DT_FLOAT, DT_STRING, DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](OneShotIterator)]]
I believe this issue is in the preprocessing step that creates the CSV files my model reads its data from, since the model expects the first Node pattern but instead gets the second Node pattern.
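The error says field 4 (the fifth column, which is latency in my column order) contains the literal string "latency", so the first thing I want to do is look at the first few raw records the model is actually handed. A quick diagnostic sketch (the CSV_PATTERN below is a placeholder for wherever the model's input CSVs really live):

import tensorflow as tf

# Placeholder path -- point this at whatever CSV files the model's input_fn reads.
CSV_PATTERN = 'gs://<BUCKET>/logs2/preproc_tft/train*'

for filename in tf.gfile.Glob(CSV_PATTERN)[:1]:
    with tf.gfile.Open(filename) as f:
        for i, line in enumerate(f):
            fields = line.rstrip('\n').split(',')
            # Field 4 should hold the latency value; if this prints the literal
            # string 'latency', a header row or a reordered record is reaching DecodeCSV.
            print(i, len(fields), 'field 4 =', repr(fields[4]))
            if i >= 4:
                break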
This is my preprocessing code (I purposely cast the dtypes in the query to ensure they would be read correctly):
query = """
SELECT CAST(end_time AS STRING) AS end_time, CAST(device AS STRING) AS device, CAST(device_os AS STRING) AS device_os, CAST(device_os_version AS STRING) AS device_os_version, CAST(latency AS FLOAT) AS latency,
CAST(megacycles AS FLOAT) AS megacycles, CAST(cost AS FLOAT) AS cost, CAST(status AS STRING) AS Status, CAST(device_brand AS STRING) AS device_brand, CAST(device_family AS STRING) AS device_family,
CAST(browser_version AS STRING) AS browser_version, CAST(app AS STRING) AS app, CAST(ua_parse AS STRING) AS ua_parse
FROM [<mytable>:daily_logs.app_logs_data]
WHERE start_time >='2018-04-16'
GROUP BY end_time, device, device_os, device_os_version, latency, megacycles, cost, Status, device_brand, device_family, browser_version, app, ua_parse
"""
def preprocess_tft(inputs):
    import copy
    import numpy as np

    def center(x):
        return x - tft.mean(x)

    result = copy.copy(inputs)  # shallow copy
    result['end_time'] = tft.string_to_int(inputs['end_time'])
    result['device'] = tft.string_to_int(inputs['device'])
    result['device_os'] = tft.string_to_int(inputs['device_os'])
    result['device_os_version'] = tft.string_to_int(inputs['device_os_version'])
    result['latency_tft'] = center(inputs['latency'])
    result['megacycles_tft'] = center(inputs['megacycles'])
    result['cost_tft'] = center(inputs['cost'])
    result['Status'] = tft.string_to_int(inputs['Status'])
    result['device_brand'] = tft.string_to_int(inputs['device_brand'])
    result['device_family'] = tft.string_to_int(inputs['device_family'])
    result['browser_version'] = tft.string_to_int(inputs['browser_version'])
    result['app'] = tft.string_to_int(inputs['app'])
    result['ua_parse'] = tft.string_to_int(inputs['ua_parse'])
    return result
    #return inputs
def cleanup(rowdict):
    import copy, hashlib
    CSV_COLUMNS = 'end_time,device,device_os,device_os_version,latency,megacycles,cost,Status,device_brand,device_family,browser_version,app,ua_parse'.split(',')
    STR_COLUMNS = 'key,end_time,device,device_os,device_os_version,Status,device_brand,device_family,browser_version,app,ua_parse'.split(',')
    FLT_COLUMNS = 'latency,megacycles,cost'.split(',')

    # add any missing columns, and correct the types
    def tofloat(value, ifnot):
        try:
            return float(value)
        except (ValueError, TypeError):
            return ifnot

    result = {
        k: str(rowdict[k]) if k in rowdict else 'None' for k in STR_COLUMNS
    }
    result.update({
        k: tofloat(rowdict[k], -99) if k in rowdict else -99 for k in FLT_COLUMNS
    })

    # cleanup: write out only the data that we want to train on
    if result['latency'] > 0 and result['megacycles'] > 0 and result['cost'] > 0:
        data = ','.join([str(result[k]) for k in CSV_COLUMNS])
        result['key'] = hashlib.sha224(data).hexdigest()
        yield result
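Because cleanup is a plain Python generator, it can be exercised outside Beam on a hand-built row to see exactly which value types it yields. A minimal sanity check (the sample field values below are made up):

sample = {
    'end_time': '2018-04-16 00:00:00', 'device': 'desktop', 'device_os': 'linux',
    'device_os_version': '4.4', 'latency': '0.42', 'megacycles': '120', 'cost': '0.003',
    'Status': '200', 'device_brand': 'acme', 'device_family': 'other',
    'browser_version': '65', 'app': 'web', 'ua_parse': 'Chrome',
}
for row in cleanup(sample):
    for k, v in sorted(row.items()):
        print(k, type(v).__name__, v)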
def preprocess(query, in_test_mode):
    import os
    import os.path
    import tempfile
    import tensorflow as tf
    from apache_beam.io import tfrecordio
    from tensorflow_transform.coders import example_proto_coder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import dataset_schema
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io

    job_name = 'preprocess-log-features' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')
    if in_test_mode:
        import shutil
        print 'Launching local job ... hang on'
        OUTPUT_DIR = './preproc_tft'
        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    else:
        print 'Launching Dataflow job {} ... hang on'.format(job_name)
        OUTPUT_DIR = 'gs://{0}/logs2/preproc_tft/'.format(BUCKET)
        import subprocess
        subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())

    options = {
        'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
        'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
        'job_name': job_name,
        'project': PROJECT,
        'max_num_workers': 24,
        'teardown_policy': 'TEARDOWN_ALWAYS',
        'no_save_main_session': True,
        'requirements_file': 'requirements.txt'
    }
    opts = beam.pipeline.PipelineOptions(flags=[], **options)
    if in_test_mode:
        RUNNER = 'DirectRunner'
    else:
        RUNNER = 'DataflowRunner'

    # set up metadata
    raw_data_schema = {
        colname: dataset_schema.ColumnSchema(tf.string, [], dataset_schema.FixedColumnRepresentation())
        for colname in 'key,end_time,device,device_os,device_os_version,Status,device_brand,device_family,browser_version,app,ua_parse'.split(',')
    }
    raw_data_schema.update({
        colname: dataset_schema.ColumnSchema(tf.float32, [], dataset_schema.FixedColumnRepresentation())
        for colname in 'latency,megacycles,cost'.split(',')
    })
    raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema))

    def read_rawdata(p, step, test_mode):
        if step == 'train':
            selquery = 'SELECT * FROM ({})'.format(query)
        else:
            selquery = 'SELECT * FROM ({})'.format(query)
        if in_test_mode:
            selquery = selquery + ' LIMIT 100'
        #print 'Processing {} data from {}'.format(step, selquery)
        return (p
                | '{}_read'.format(step) >> beam.io.Read(beam.io.BigQuerySource(query=selquery, use_standard_sql=False))
                | '{}_cleanup'.format(step) >> beam.FlatMap(cleanup)
               )

    # run Beam
    with beam.Pipeline(RUNNER, options=opts) as p:
        with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):
            # analyze and transform training
            raw_data = read_rawdata(p, 'train', in_test_mode)
            raw_dataset = (raw_data, raw_data_metadata)
            transformed_dataset, transform_fn = (
                raw_dataset | beam_impl.AnalyzeAndTransformDataset(preprocess_tft))
            transformed_data, transformed_metadata = transformed_dataset
            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(OUTPUT_DIR, 'train'),
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))

            # transform eval data
            raw_test_data = read_rawdata(p, 'eval', in_test_mode)
            raw_test_dataset = (raw_test_data, raw_data_metadata)
            transformed_test_dataset = (
                (raw_test_dataset, transform_fn) | beam_impl.TransformDataset())
            transformed_test_data, _ = transformed_test_dataset
            _ = transformed_test_data | 'WriteTestData' >> tfrecordio.WriteToTFRecord(
                os.path.join(OUTPUT_DIR, 'eval'),
                coder=example_proto_coder.ExampleProtoCoder(
                    transformed_metadata.schema))
            _ = (transform_fn
                 | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(os.path.join(OUTPUT_DIR, 'metadata')))

        job = p.run()

    if in_test_mode:
        job.wait_until_finish()
        print "Done!"

preprocess(query, in_test_mode=False)
Is there a way to print out the dtypes before I send the data into the Beam pipeline, so I can check the dtype array and make sure it is valid? Is there something in the code that is causing the dtypes to be different, or to be arranged in a different order than specified in the CSV_COLUMNS variable?
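For illustration, this is the kind of standalone check I have in mind; it simply rebuilds the column-to-dtype mapping used for raw_data_schema and prints it in CSV_COLUMNS order (a sketch run outside the pipeline, not the schema object itself):

import tensorflow as tf

STR_COLUMNS = 'key,end_time,device,device_os,device_os_version,Status,device_brand,device_family,browser_version,app,ua_parse'.split(',')
FLT_COLUMNS = 'latency,megacycles,cost'.split(',')
CSV_COLUMNS = 'end_time,device,device_os,device_os_version,latency,megacycles,cost,Status,device_brand,device_family,browser_version,app,ua_parse'.split(',')

col_dtypes = {c: tf.string for c in STR_COLUMNS}
col_dtypes.update({c: tf.float32 for c in FLT_COLUMNS})

for i, c in enumerate(CSV_COLUMNS):
    # field index as DecodeCSV sees it, column name, and the dtype the schema declares
    print(i, c, col_dtypes[c])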
Source: https://stackoverflow.com/questions/49911534/pre-processing-data-for-tensorflow-invalidargumenterror