/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.classifier.sgd; import com.google.common.base.Charsets; import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Lists; import com.google.common.io.Closeables; import com.google.common.io.Files; import org.apache.mahout.common.RandomUtils; import org.apache.mahout.math.DenseVector; import org.apache.mahout.math.Vector; import org.apache.mahout.math.list.IntArrayList; import org.apache.mahout.math.stats.OnlineSummarizer; import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder; import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder; import java.io.BufferedReader; import java.io.Closeable; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.PrintWriter; import java.nio.ByteBuffer; import java.util.List; import java.util.Random; /** * Shows how different encoding choices can make big speed differences. *
* Run with command line options --generate 1000000 test.csv to generate a million data lines in
 * test.csv.
 *
 * Run with command line options --parse test.csv to time how long it takes to parse and encode
 * those million data points.
 *
 * Run with command line options --fast test.csv to time how long it takes to parse and encode those
 * million data points using byte-level parsing and direct value encoding.
 *
 * This doesn't demonstrate text encoding which is subject to somewhat different tricks.  The basic
 * idea of caching hash locations and byte level parsing still very much applies to text, however.
 */
public final class SimpleCsvExamples {

  public static final char SEPARATOR_CHAR = '\t';
  private static final int FIELDS = 100;

  private SimpleCsvExamples() {
  }

  /**
   * Command-line entry point. Dispatches on {@code args[0]}:
   * <ul>
   *   <li>{@code --generate <count> <file>} writes {@code count} generated lines to {@code file}</li>
   *   <li>{@code --parse <file>} reads {@code file} line by line, summarizes and encodes every field,
   *       then prints the per-field means</li>
   *   <li>{@code --fast <file>} does the same through {@code FastLineReader}</li>
   * </ul>
   * After a recognized mode finishes, the elapsed wall-clock time in seconds is printed. If the
   * arguments are missing or the mode is not recognized, a usage message is written to standard
   * error and nothing else is done.
   *
   * @param args mode followed by the mode's arguments, as listed above
   * @throws IOException if the data file cannot be opened, read or written
   * @throws NumberFormatException if the count given to {@code --generate} is not an integer
   */
  public static void main(String[] args) throws IOException {
    if (!isValidInvocation(args)) {
      System.err.println("Usage: SimpleCsvExamples --generate <count> <file>"
          + " | --parse <file> | --fast <file>");
      return;
    }

    FeatureVectorEncoder[] encoder = new FeatureVectorEncoder[FIELDS];
    for (int i = 0; i < FIELDS; i++) {
      // Use the loop index so every field gets a distinct encoder name; the previous literal 1
      // named all of them "v1". NOTE(review): presumably the name decides where the encoder
      // writes in the vector - confirm against ConstantValueEncoder.
      encoder[i] = new ConstantValueEncoder("v" + i);
    }

    OnlineSummarizer[] s = new OnlineSummarizer[FIELDS];
    for (int i = 0; i < FIELDS; i++) {
      s[i] = new OnlineSummarizer();
    }

    long t0 = System.currentTimeMillis();
    Vector v = new DenseVector(1000);
    if ("--generate".equals(args[0])) {
      // Parse the count before touching the file so a bad count cannot truncate existing data.
      int n = Integer.parseInt(args[1]);
      runGenerate(n, new File(args[2]));
    } else if ("--parse".equals(args[0])) {
      runParse(new File(args[1]), encoder, s, v);
      printMeans(s);
    } else {
      // isValidInvocation guarantees the only remaining mode is --fast
      runFast(args[1], encoder, s, v);
      printMeans(s);
    }
    System.out.printf("\nElapsed time = %.3f\n", (System.currentTimeMillis() - t0) / 1000.0);
  }

  /** Returns true when args names a known mode and carries enough arguments for that mode. */
  private static boolean isValidInvocation(String[] args) {
    if (args == null || args.length < 2) {
      return false;
    }
    if ("--generate".equals(args[0])) {
      return args.length >= 3;
    }
    return "--parse".equals(args[0]) || "--fast".equals(args[0]);
  }

  /** Writes n generated lines to file as UTF-8, failing loudly if any write was dropped. */
  private static void runGenerate(int n, File file) throws IOException {
    // Write with the same charset the --parse mode reads with.
    PrintWriter out = new PrintWriter(Files.newWriter(file, Charsets.UTF_8));
    try {
      for (int i = 0; i < n; i++) {
        Line x = Line.generate();
        out.println(x);
      }
      // PrintWriter never throws on write failure; checkError() flushes and reports it.
      if (out.checkError()) {
        throw new IOException("Error writing generated data to " + file);
      }
    } finally {
      Closeables.closeQuietly(out);
    }
  }

  /** Reads file with the String-based Line parser, feeding each field to s[i] and encoder[i]. */
  private static void runParse(File file, FeatureVectorEncoder[] encoder, OnlineSummarizer[] s,
                               Vector v) throws IOException {
    BufferedReader in = Files.newReader(file, Charsets.UTF_8);
    try {
      String line = in.readLine();
      while (line != null) {
        v.assign(0);
        Line x = new Line(line);
        for (int i = 0; i < FIELDS; i++) {
          s[i].add(x.getDouble(i));
          encoder[i].addToVector(x.get(i), v);
        }
        line = in.readLine();
      }
    } finally {
      Closeables.closeQuietly(in);
    }
  }

  /** Reads the named file through FastLineReader, feeding each field to s[i] and encoder[i]. */
  private static void runFast(String fileName, FeatureVectorEncoder[] encoder,
                              OnlineSummarizer[] s, Vector v) throws IOException {
    FastLineReader in = new FastLineReader(new FileInputStream(fileName));
    try {
      FastLine line = in.read();
      while (line != null) {
        v.assign(0);
        for (int i = 0; i < FIELDS; i++) {
          double z = line.getDouble(i);
          s[i].add(z);
          // no original form is passed; the parsed value is handed over directly as the weight
          encoder[i].addToVector((byte[]) null, z, v);
        }
        line = in.read();
      }
    } finally {
      Closeables.closeQuietly(in);
    }
  }

  /** Prints the mean of every field on one line, comma separated, with three decimals. */
  private static void printMeans(OnlineSummarizer[] s) {
    String separator = "";
    for (int i = 0; i < FIELDS; i++) {
      System.out.printf("%s%.3f", separator, s[i].getMean());
      separator = ",";
    }
  }

  private static final class Line {
    private static final Splitter ON_TABS = Splitter.on(SEPARATOR_CHAR).trimResults();
    public static final Joiner WITH_COMMAS = Joiner.on(SEPARATOR_CHAR);

    public static final Random rand = RandomUtils.getRandom();

    private final List