Write tfrecords from beam pipeline?

Submitted by 随声附和 on 2020-04-30 08:21:47

Question


I have some data as a Map and I want to convert it to TFRecords using a Beam pipeline. Here is my attempt at the code. I have a working Python implementation, but I need to do this in Java because it depends on business logic that I can't port to Python. The corresponding working Python implementation can be found in this question.

import com.google.protobuf.ByteString;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.extensions.protobuf.ProtoCoder;
import org.apache.beam.sdk.io.TFRecordIO;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.commons.lang3.RandomStringUtils;
import org.tensorflow.example.BytesList;
import org.tensorflow.example.Example;
import org.tensorflow.example.Feature;
import org.tensorflow.example.Features;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public class Sample {

    static class Foo extends DoFn<Map<String, String>, Example> {

        public static Feature stringToFeature(String value) {
            ByteString byteString = ByteString.copyFrom(value.getBytes(StandardCharsets.UTF_8));
            BytesList bytesList = BytesList.newBuilder().addValue(byteString).build();
            return Feature.newBuilder().setBytesList(bytesList).build();
        }

        @ProcessElement
        public void processElement(@Element Map<String, String> element, OutputReceiver<Example> receiver) {

            Features features = Features.newBuilder()
                    .putFeature("foo", stringToFeature(element.get("foo")))
                    .putFeature("bar", stringToFeature(element.get("bar")))
                    .build();

            Example example = Example
                    .newBuilder()
                    .setFeatures(features)
                    .build();

            receiver.output(example);
        }

    }

    private static Map<String, String> generateRecord() {
        String[] keys = {"foo", "bar"};
        return IntStream.range(0,keys.length)
                .boxed()
                .collect(Collectors
                        .toMap(i -> keys[i],
                                i -> RandomStringUtils.randomAlphabetic(8)));
    }

    public static void main(String[] args) {

        List<Map<String, String>> records = new ArrayList<>();
        for (int i=0; i<10; i++) {
            records.add(generateRecord());
        }

        System.out.println(records);
        Pipeline p = Pipeline.create();

        p.apply("Input creation", Create.of(records))
                .apply("Encode to Exampple", ParDo.of(new Foo())).setCoder(ProtoCoder.of(Example.class))
                .apply("Write to disk",
                        TFRecordIO.write()
                                .to("output")
                                .withNumShards(2)
                                .withSuffix(".tfrecord"));

        p.run();


    }
}

For the above code I get the following error at compile time:

Error:(70, 17) java: no suitable method found for apply(java.lang.String,org.apache.beam.sdk.io.TFRecordIO.Write)
    method org.apache.beam.sdk.values.PCollection.<OutputT>apply(org.apache.beam.sdk.transforms.PTransform<? super org.apache.beam.sdk.values.PCollection<org.tensorflow.example.Example>,OutputT>) is not applicable
      (cannot infer type-variable(s) OutputT
        (actual and formal argument lists differ in length))
    method org.apache.beam.sdk.values.PCollection.<OutputT>apply(java.lang.String,org.apache.beam.sdk.transforms.PTransform<? super org.apache.beam.sdk.values.PCollection<org.tensorflow.example.Example>,OutputT>) is not applicable
      (cannot infer type-variable(s) OutputT
        (argument mismatch; org.apache.beam.sdk.io.TFRecordIO.Write cannot be converted to org.apache.beam.sdk.transforms.PTransform<? super org.apache.beam.sdk.values.PCollection<org.tensorflow.example.Example>,OutputT>))

Answer 1:


The input to TFRecordIO.write() should be byte[], so making the following changes worked for me.

static class Foo extends DoFn<Map<String, String>, byte[]> {

    public static Feature stringToFeature(String value) {
        ByteString byteString = ByteString.copyFrom(value.getBytes(StandardCharsets.UTF_8));
        BytesList bytesList = BytesList.newBuilder().addValue(byteString).build();
        return Feature.newBuilder().setBytesList(bytesList).build();
    }

    @ProcessElement
    public void processElement(@Element Map<String, String> element, OutputReceiver<byte[]> receiver) {

        Features features = Features.newBuilder()
                .putFeature("foo", stringToFeature(element.get("foo")))
                .putFeature("bar", stringToFeature(element.get("bar")))
                .build();

        Example example = Example
                .newBuilder()
                .setFeatures(features)
                .build();

        receiver.output(example.toByteArray());
    }

}
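
With Foo now emitting byte[], the pipeline wiring in main() stays almost the same. A minimal sketch (assuming the same records list and transform names as in the question): the setCoder(ProtoCoder.of(...)) call can be dropped, since Beam uses ByteArrayCoder for byte[] and TFRecordIO.write() consumes a PCollection<byte[]>.

Pipeline p = Pipeline.create();

p.apply("Input creation", Create.of(records))
        .apply("Encode to Example", ParDo.of(new Foo()))   // Foo now outputs serialized Example bytes
        .apply("Write to disk",
                TFRecordIO.write()
                        .to("output")
                        .withNumShards(2)
                        .withSuffix(".tfrecord"));

p.run();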



Answer 2:


You need to convert the input to TFRecordIO to byte[].

You can do it with a transform like:

static class StringToByteArray extends DoFn<String, byte[]> {
    @ProcessElement
    public void processElement(ProcessContext c) {
        // Charsets is typically Guava's com.google.common.base.Charsets;
        // java.nio.charset.StandardCharsets.UTF_8 works as well.
        c.output(c.element().getBytes(Charsets.UTF_8));
    }
}
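
A minimal usage sketch, assuming an upstream PCollection<String> named strings (a hypothetical name for whatever your earlier transforms produce), plugs this DoFn in right before the write:

PCollection<String> strings = ...;  // hypothetical upstream output

strings.apply("To bytes", ParDo.of(new StringToByteArray()))
        .apply("Write to disk",
                TFRecordIO.write()
                        .to("output")
                        .withSuffix(".tfrecord"));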


Source: https://stackoverflow.com/questions/61270215/write-tfrecords-from-beam-pipeline
