How can one list all csv files in an HDFS location within the Spark Scala shell?

温柔的废话 2021-01-05 06:27

The purpose of this is to manipulate and save a copy of each data file to a second location in HDFS. I will be using

    RddName.coalesce(1).saveAsTextFile(...)
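
Spelled out for a single file whose path is already known, that would look roughly like this (the paths below are just placeholders):

    // Sketch of the intended per-file copy; the paths below are placeholders.
    val inputPath  = "hdfs:///data/in/example.csv"
    val outputPath = "hdfs:///data/out/example"

    val rdd = sc.textFile(inputPath)             // per-line manipulation would go here
    rdd.coalesce(1).saveAsTextFile(outputPath)   // writes a directory holding a single part file

What I am missing is a way to list every .csv file under an HDFS directory from within the shell, so that this step can be run once per file.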


        
3 Answers
孤街浪徒 2021-01-05 06:57

    I haven't tested it thoroughly, but something like this seems to work:

    import org.apache.spark.deploy.SparkHadoopUtil
    import org.apache.hadoop.fs.{FileSystem, Path, LocatedFileStatus, RemoteIterator}
    import java.net.URI

    val path: String = ???

    // Build a Hadoop Configuration from the Spark conf and grab the default FileSystem.
    val hconf = SparkHadoopUtil.get.newConfiguration(sc.getConf)
    val hdfs = FileSystem.get(hconf)

    // Non-recursive listing of the directory (pass true to recurse into subdirectories).
    val iter = hdfs.listFiles(new Path(path), false)

    // Drain the RemoteIterator into a List of file URIs.
    def listFiles(iter: RemoteIterator[LocatedFileStatus]): List[URI] = {
      @annotation.tailrec
      def go(iter: RemoteIterator[LocatedFileStatus], acc: List[URI]): List[URI] = {
        if (iter.hasNext) {
          val uri = iter.next.getPath.toUri
          go(iter, uri :: acc)
        } else {
          acc
        }
      }
      go(iter, List.empty[URI])
    }

    // Keep only the .csv files.
    listFiles(iter).filter(_.toString.endsWith(".csv"))
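
    To connect this back to the original goal, the matched URIs can then drive the per-file copy. A minimal sketch that reuses path, hdfs and listFiles from above; the output directory and the identity map are placeholders, not something from the original question:

    // Hypothetical follow-up: copy each matched .csv into a second HDFS location.
    val outputBase = "hdfs:///data/processed"   // placeholder output directory

    // Re-list the directory rather than reusing the already-consumed iterator.
    val allFiles = listFiles(hdfs.listFiles(new Path(path), false))
    val csvFiles = allFiles.filter(_.toString.endsWith(".csv"))

    csvFiles.foreach { uri =>
      val name = new Path(uri).getName.stripSuffix(".csv")
      val rdd = sc.textFile(uri.toString).map(identity)   // per-line manipulation would go here
      rdd.coalesce(1).saveAsTextFile(s"$outputBase/$name")
    }

    Note that saveAsTextFile still creates one output directory per input file, each holding a single part-00000, rather than a bare file with the original name.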
    
