GiraphFileInputFormat

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.giraph.io.formats;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.InvalidInputException;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/*if[HADOOP_NON_SECURE]
else[HADOOP_NON_SECURE]
import org.apache.hadoop.mapreduce.security.TokenCache;
end[HADOOP_NON_SECURE]*/

/**
 * Provides functionality similar to {@link FileInputFormat},
 * but allows for different data sources (vertex and edge data).
 *
 * @param <K> Key
 * @param <V> Value
 */
public abstract class GiraphFileInputFormat<K, V>
    extends FileInputFormat<K, V> {
  /** Vertex input file paths. */
  public static final String VERTEX_INPUT_DIR = "giraph.vertex.input.dir";
  /** Edge input file paths. */
  public static final String EDGE_INPUT_DIR = "giraph.edge.input.dir";
  /** Number of vertex input files. */
  public static final String NUM_VERTEX_INPUT_FILES =
      "giraph.input.vertex.num.files";
  /** Number of edge input files. */
  public static final String NUM_EDGE_INPUT_FILES =
      "giraph.input.edge.num.files";

  /** Split slop. */
  private static final double SPLIT_SLOP = 1.1; // 10% slop

  /** Filter for hidden files. */
  private static final PathFilter HIDDEN_FILE_FILTER = new PathFilter() {
    public boolean accept(Path p) {
      String name = p.getName();
      return !name.startsWith("_") && !name.startsWith(".");
    }
  };

  /** Class logger. */
  private static final Logger LOG =
      Logger.getLogger(GiraphFileInputFormat.class);

  /**
   * Add a {@link org.apache.hadoop.fs.Path} to the list of vertex inputs.
   *
   * @param conf the Configuration to store the input paths
   * @param path {@link org.apache.hadoop.fs.Path} to be added to the list of
   *             vertex inputs
   * @throws IOException on I/O errors
   */
  public static void addVertexInputPath(Configuration conf,
      Path path) throws IOException {
    String dirStr = pathToDirString(conf, path);
    String dirs = conf.get(VERTEX_INPUT_DIR);
    conf.set(VERTEX_INPUT_DIR, dirs == null ? dirStr : dirs + "," + dirStr);
  }

  /**
   * Set the {@link Path} for vertex input.
   *
   * @param conf Configuration to store in
   * @param path {@link Path} to set
   * @throws IOException on I/O errors
   */
  public static void setVertexInputPath(Configuration conf,
      Path path) throws IOException {
    conf.set(VERTEX_INPUT_DIR, pathToDirString(conf, path));
  }

  /**
   * Add a {@link org.apache.hadoop.fs.Path} to the list of edge inputs.
   *
   * @param conf the Configuration to store the input paths
   * @param path {@link org.apache.hadoop.fs.Path} to be added to the list of
   *             edge inputs
   * @throws IOException on I/O errors
   */
  public static void addEdgeInputPath(Configuration conf,
      Path path) throws IOException {
    String dirStr = pathToDirString(conf, path);
    String dirs = conf.get(EDGE_INPUT_DIR);
    conf.set(EDGE_INPUT_DIR, dirs == null ? dirStr : dirs + "," + dirStr);
  }

  /**
   * Set the {@link Path} for edge input.
   *
   * @param conf Configuration to store in
   * @param path {@link Path} to set
   * @throws IOException on I/O errors
   */
  public static void setEdgeInputPath(Configuration conf,
      Path path) throws IOException {
    conf.set(EDGE_INPUT_DIR, pathToDirString(conf, path));
  }

  /**
   * Convert from a Path to a string.
   * This makes the path fully qualified and does escaping.
   *
   * @param conf Configuration to use
   * @param path Path to convert
   * @return String of the escaped, fully qualified directory
   * @throws IOException on I/O errors
   */
  private static String pathToDirString(Configuration conf, Path path)
    throws IOException {
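    // makeQualified resolves the path against the FileSystem's default URI
    // and working directory, and escapeString backslash-escapes commas so
    // multiple paths can later be joined into one comma-separated list.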
    path = path.getFileSystem(conf).makeQualified(path);
    return StringUtils.escapeString(path.toString());
  }

  /**
   * Get the list of vertex input {@link Path}s.
   *
   * @param context The job
   * @return The list of input {@link Path}s
   */
  public static Path[] getVertexInputPaths(JobContext context) {
    String dirs = context.getConfiguration().get(VERTEX_INPUT_DIR, "");
    String[] list = StringUtils.split(dirs);
    Path[] result = new Path[list.length];
    for (int i = 0; i < list.length; i++) {
      result[i] = new Path(StringUtils.unEscapeString(list[i]));
    }
    return result;
  }

  /**
   * Get the list of edge input {@link Path}s.
   *
   * @param context The job
   * @return The list of input {@link Path}s
   */
  public static Path[] getEdgeInputPaths(JobContext context) {
    String dirs = context.getConfiguration().get(EDGE_INPUT_DIR, "");
    String[] list = StringUtils.split(dirs);
    Path[] result = new Path[list.length];
    for (int i = 0; i < list.length; i++) {
      result[i] = new Path(StringUtils.unEscapeString(list[i]));
    }
    return result;
  }

  /**
   * Proxy PathFilter that accepts a path only if all filters given in the
   * constructor do. Used by listStatus() to apply the built-in
   * HIDDEN_FILE_FILTER together with a user-provided one (if any).
   */
  private static class MultiPathFilter implements PathFilter {
    /** List of filters. */
    private List<PathFilter> filters;

    /**
     * Constructor.
     *
     * @param filters The list of filters
     */
    public MultiPathFilter(List<PathFilter> filters) {
      this.filters = filters;
    }

    /**
     * True iff all filters accept the given path.
     *
     * @param path The path to check
     * @return Whether the path is accepted
     */
    public boolean accept(Path path) {
      for (PathFilter filter : filters) {
        if (!filter.accept(path)) {
          return false;
        }
      }
      return true;
    }
  }

  /**
   * Common method for listing vertex/edge input directories.
   *
   * @param job The job
   * @param dirs Array of vertex/edge input paths
   * @return List of FileStatus objects for all matching files
   * @throws IOException if no input paths are specified, a path does not
   *         exist, or a pattern matches no files
   */
  private List<FileStatus> listStatus(JobContext job, Path[] dirs)
    throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    if (dirs.length == 0) {
      throw new IOException("No input paths specified in job");
    }

/*if[HADOOP_NON_SECURE]
else[HADOOP_NON_SECURE]
    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs,
        job.getConfiguration());
end[HADOOP_NON_SECURE]*/

    List<IOException> errors = new ArrayList<IOException>();

    // Create a MultiPathFilter with the HIDDEN_FILE_FILTER and the
    // user-provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(HIDDEN_FILE_FILTER);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
      filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);

    for (Path p : dirs) {
      FileSystem fs = p.getFileSystem(job.getConfiguration());
      FileStatus[] matches = fs.globStatus(p, inputFilter);
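      // globStatus returns null when nothing exists at the given path, and
      // an empty array when a glob pattern matches no files; both cases are
      // collected as errors and reported together below.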
      if (matches == null) {
        errors.add(new IOException("Input path does not exist: " + p));
      } else if (matches.length == 0) {
        errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
      } else {
        for (FileStatus globStat : matches) {
          if (globStat.isDir()) {
            Collections.addAll(result, fs.listStatus(globStat.getPath(),
                inputFilter));
          } else {
            result.add(globStat);
          }
        }
      }
    }

    if (!errors.isEmpty()) {
      throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
  }

  /**
   * List vertex input directories.
   *
   * @param job the job to list vertex input paths for
   * @return List of FileStatus objects for the vertex input files
   * @throws IOException if no vertex input paths are specified or a path
   *         does not exist
   */
  protected List<FileStatus> listVertexStatus(JobContext job)
    throws IOException {
    return listStatus(job, getVertexInputPaths(job));
  }

  /**
   * List edge input directories.
   *
   * @param job the job to list edge input paths for
   * @return List of FileStatus objects for the edge input files
   * @throws IOException if no edge input paths are specified or a path
   *         does not exist
   */
  protected List<FileStatus> listEdgeStatus(JobContext job)
    throws IOException {
    return listStatus(job, getEdgeInputPaths(job));
  }

  /**
   * Common method for generating the list of vertex/edge input splits.
   *
   * @param job The job
   * @param files List of FileStatus objects for vertex/edge input files
   * @return The list of vertex/edge input splits
   * @throws IOException on I/O errors
   */
  private List<InputSplit> getSplits(JobContext job, List<FileStatus> files)
    throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();

    for (FileStatus file : files) {
      Path path = file.getPath();
      FileSystem fs = path.getFileSystem(job.getConfiguration());
      long length = file.getLen();
      BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
      if ((length != 0) && isSplitable(job, path)) {
        long blockSize = file.getBlockSize();
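        // Inherited from FileInputFormat:
        // splitSize = max(minSize, min(maxSize, blockSize)).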
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);

        long bytesRemaining = length;
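        // Emit full-size splits while more than SPLIT_SLOP (1.1) splits'
        // worth of data remains; the final split can thus be up to 10%
        // larger than splitSize, avoiding a tiny trailing split.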
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
          int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
          splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
              blkLocations[blkIndex].getHosts()));
          bytesRemaining -= splitSize;
        }

        if (bytesRemaining != 0) {
          splits.add(new FileSplit(path, length - bytesRemaining,
              bytesRemaining,
              blkLocations[blkLocations.length - 1].getHosts()));
        }
      } else if (length != 0) {
        splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
      } else {
        // Create empty hosts array for zero-length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
      }
    }
    return splits;
  }

  /**
   * Generate the list of vertex input splits.
   *
   * @param job The job
   * @return The list of vertex input splits
   * @throws IOException on I/O errors
   */
  public List<InputSplit> getVertexSplits(JobContext job) throws IOException {
    List<FileStatus> files = listVertexStatus(job);
    List<InputSplit> splits = getSplits(job, files);
    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_VERTEX_INPUT_FILES, files.size());
    LOG.debug("Total # of vertex splits: " + splits.size());
    return splits;
  }

  /**
   * Generate the list of edge input splits.
   *
   * @param job The job
   * @return The list of edge input splits
   * @throws IOException on I/O errors
   */
  public List<InputSplit> getEdgeSplits(JobContext job) throws IOException {
    List<FileStatus> files = listEdgeStatus(job);
    List<InputSplit> splits = getSplits(job, files);
    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_EDGE_INPUT_FILES, files.size());
    LOG.debug("Total # of edge splits: " + splits.size());
    return splits;
  }
}
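
Usage sketch (not part of the source above): the snippet below illustrates how a driver could register vertex and edge inputs through this class. The input paths are placeholders, and the commented-out calls assume a concrete GiraphFileInputFormat subclass and a JobContext, which this file does not provide.

import org.apache.giraph.io.formats.GiraphFileInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class GiraphInputSetup {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // Each call qualifies and escapes the path, then appends it to the
    // comma-separated list stored under giraph.vertex.input.dir or
    // giraph.edge.input.dir. Paths here are illustrative placeholders.
    GiraphFileInputFormat.addVertexInputPath(conf, new Path("/graph/vertices"));
    GiraphFileInputFormat.addEdgeInputPath(conf, new Path("/graph/edges/a"));
    GiraphFileInputFormat.addEdgeInputPath(conf, new Path("/graph/edges/b"));

    // The registered paths can be read back, unescaped, as Path objects:
    //   Path[] vertexPaths = GiraphFileInputFormat.getVertexInputPaths(ctx);
    // and a concrete subclass, given a JobContext, can enumerate splits for
    // each data source independently via getVertexSplits / getEdgeSplits.
  }
}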