История изменений
Исправление
Legioner,
(текущая версия)
:
Анонимус вдохновил. Первая версия, код тупо в лоб.
package test;
import java.io.*;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
/**
 * Counts case-insensitive ASCII words in a text file and writes the result, most frequent word
 * first, as {@code "<count> <word>"} lines.
 *
 * <p>A word is a maximal run of the bytes {@code A-Z} / {@code a-z}; upper-case letters are folded
 * to lower case and every other byte (including any non-ASCII byte) acts as a separator.
 */
public class WordsCounter {

    /** Input file used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "C:\\Users\\Vladimir\\Downloads\\huge.txt";

    /** Output file used when fewer than two command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "tmp\\result.txt";

    /** Largest region mapped at once: {@code FileChannel.map} rejects sizes above this. */
    private static final long MAX_MAP_SIZE = Integer.MAX_VALUE;

    /**
     * Runs the three phases (count, sort, write) and prints the elapsed nanoseconds of each.
     *
     * @param args optional: {@code args[0]} is the input file, {@code args[1]} the output file;
     *     missing arguments fall back to the built-in default paths
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws IOException {
        var startTime = System.nanoTime();
        var inputPath = Path.of(args != null && args.length > 0 ? args[0] : DEFAULT_INPUT);
        var resultPath = Path.of(args != null && args.length > 1 ? args[1] : DEFAULT_OUTPUT);

        var wordStats = countWords(inputPath);
        var part1Time = System.nanoTime();
        System.out.println("Part 1: " + (part1Time - startTime));

        var wordStatList = new ArrayList<>(wordStats.values());
        // comparingInt avoids boxing every count; the resulting order is the same.
        wordStatList.sort(Comparator.comparingInt(WordStat::count).reversed());
        var part2Time = System.nanoTime();
        System.out.println("Part 2: " + (part2Time - part1Time));

        writeResult(resultPath, wordStatList);
        var part3Time = System.nanoTime();
        System.out.println("Part 3: " + (part3Time - part2Time));
        System.out.println("Total: " + (part3Time - startTime));
    }

    /** Scans the file through read-only memory mappings and returns the statistics per word. */
    private static Map<String, WordStat> countWords(Path inputPath) throws IOException {
        var wordStats = new HashMap<String, WordStat>();
        var wordBuilder = new StringBuilder();
        try (var fileChannel = FileChannel.open(inputPath)) {
            long size = fileChannel.size();
            long position = 0;
            // Map the file in chunks so files larger than 2 GiB work; the builder is shared
            // across chunks, so a word that straddles a chunk boundary stays in one piece.
            while (position < size) {
                long chunkSize = Math.min(MAX_MAP_SIZE, size - position);
                MappedByteBuffer mappedBuffer =
                        fileChannel.map(FileChannel.MapMode.READ_ONLY, position, chunkSize);
                while (mappedBuffer.hasRemaining()) {
                    byte b = mappedBuffer.get();
                    if (('A' <= b) && (b <= 'Z')) {
                        wordBuilder.append((char) (b + ('a' - 'A')));
                    } else if (('a' <= b) && (b <= 'z')) {
                        wordBuilder.append((char) b);
                    } else {
                        flushWord(wordBuilder, wordStats);
                    }
                }
                position += chunkSize;
            }
        }
        // The file may end in the middle of a word; without this flush that word would be lost.
        flushWord(wordBuilder, wordStats);
        return wordStats;
    }

    /** Records the word accumulated in the builder, if any, and empties the builder. */
    private static void flushWord(StringBuilder wordBuilder, Map<String, WordStat> wordStats) {
        if (wordBuilder.isEmpty()) {
            return;
        }
        var word = wordBuilder.toString();
        var wordStat = wordStats.get(word);
        if (wordStat == null) {
            wordStats.put(word, new WordStat(word));
        } else {
            wordStat.incCount();
        }
        wordBuilder.setLength(0);
    }

    /** Writes one {@code "<count> <word>"} line per entry, in list order. */
    private static void writeResult(Path resultPath, List<WordStat> wordStatList)
            throws IOException {
        var parent = resultPath.getParent();
        if (parent != null && !Files.isDirectory(parent)) {
            // Create the output directory instead of failing when it does not exist yet.
            Files.createDirectories(parent);
        }
        try (var writer = Files.newBufferedWriter(resultPath)) {
            for (WordStat wordStat : wordStatList) {
                writer.write(Integer.toString(wordStat.count()));
                writer.write(' ');
                writer.write(wordStat.word());
                writer.newLine();
            }
        }
    }

    /** Mutable occurrence counter for a single word; a new instance starts at one occurrence. */
    static class WordStat {
        private final String word;
        private int count;

        /**
         * @param word the (already lower-cased) word this counter belongs to
         */
        WordStat(String word) {
            this.word = word;
            count = 1;
        }

        /** Returns the word being counted. */
        String word() {
            return word;
        }

        /** Returns how many occurrences have been recorded so far. */
        int count() {
            return count;
        }

        /** Records one more occurrence. */
        void incCount() {
            count++;
        }
    }
}
Отрабатывает за
Part 1: 4861061700
Part 2: 62542400
Part 3: 59626800
Total: 4983230900
Ну т.е. за 5 секунд примерно.
Сейчас попробую все 4 ядра загрузить. Думаю, секунду-другую есть шансы выиграть.
Исходная версия
Legioner,
:
Анонимус вдохновил. Первая версия, код тупо в лоб.
[code=java] package test;
import java.io.*;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
/**
 * Counts words in a hard-coded input file and writes "count word" lines, highest count first,
 * to tmp\result.txt, printing the elapsed nanoseconds of each phase.
 *
 * A word is a run of the bytes A-Z / a-z; upper-case letters are folded to lower case and any
 * other byte ends the current word.
 */
public class WordsCounter { public static void main(String[] args) throws IOException { var startTime = System.nanoTime();
// Maps each lower-cased word to its mutable counter.
var wordStats = new HashMap<String, WordStat>();
var inputPath = Path.of("C:\\Users\\Vladimir\\Downloads\\huge.txt");
try (var fileChannel = FileChannel.open(inputPath)) {
// NOTE(review): the whole file is mapped in a single call; FileChannel.map is documented to
// reject sizes above Integer.MAX_VALUE, so inputs over 2 GiB would need chunked mapping.
MappedByteBuffer mappedBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileChannel.size());
// Accumulates the letters of the word currently being read.
var wordBuilder = new StringBuilder();
while (mappedBuffer.hasRemaining()) {
byte b = mappedBuffer.get();
if (('A' <= b) && (b <= 'Z')) {
// Fold an upper-case ASCII letter to lower case.
wordBuilder.append((char) (b + ('a' - 'A')));
} else if (('a' <= b) && (b <= 'z')) {
wordBuilder.append((char) b);
} else {
// Any non-letter byte terminates the current word, if there is one.
if (!wordBuilder.isEmpty()) {
var word = wordBuilder.toString();
var wordStat = wordStats.get(word);
if (wordStat == null) {
// First occurrence: a new WordStat starts with count 1.
wordStat = new WordStat(word);
wordStats.put(word, wordStat);
} else {
wordStat.incCount();
}
// Reset the builder for the next word.
wordBuilder.delete(0, wordBuilder.length());
}
}
}
// NOTE(review): a word still held in wordBuilder when the buffer is exhausted is never
// recorded, so the last word is dropped if the file does not end with a non-letter byte.
}
var part1Time = System.nanoTime();
System.out.println("Part 1: " + (part1Time - startTime));
// Sort the counters by descending count.
var wordStatList = new ArrayList<>(wordStats.values());
wordStatList.sort(Comparator.comparing(WordStat::count).reversed());
var part2Time = System.nanoTime();
System.out.println("Part 2: " + (part2Time - part1Time));
var resultPath = Path.of("tmp\\result.txt");
// Write one "<count> <word>" line per word, in sorted order.
try (var writer = Files.newBufferedWriter(resultPath)) {
for (WordStat wordStat : wordStatList) {
writer.write(Integer.toString(wordStat.count()));
writer.write(' ');
writer.write(wordStat.word());
writer.newLine();
}
}
var part3Time = System.nanoTime();
System.out.println("Part 3: " + (part3Time - part2Time));
System.out.println("Total: " + (part3Time - startTime));
}
/** Mutable occurrence counter for a single word. */
static class WordStat {
private final String word;
private int count;
/** Creates a counter for the given word with an initial count of 1. */
WordStat(String word) {
this.word = word;
count = 1;
}
/** Returns the word being counted. */
String word() {
return word;
}
/** Returns the number of occurrences recorded so far. */
int count() {
return count;
}
/** Records one more occurrence. */
void incCount() {
count++;
}
}
} [/code]
Отрабатывает за
[code]
Part 1: 4861061700
Part 2: 62542400
Part 3: 59626800
Total: 4983230900
[/code]
Ну т.е. за 5 секунд примерно.
Сейчас попробую все 4 ядра загрузить. Думаю, секунду-другую есть шансы выиграть.