=====Overview=====
Sometime ago I had to pass a coding interview for one company and they gave me the following challange:
Your challenge, should you choose to accept it, is to write a program that meets the requirements listed below. Feel free to implement the program in the language of your choice (Go, Ruby, or Python are preferred if you're choosing between languages).
The program accepts as arguments a list of one or more file paths (e.g. ./solution.rb file1.txt file2.txt ...).
The program also accepts input on stdin (e.g. cat file1.txt | ./solution.rb).
The program outputs a list of the 100 most common three word sequences in the text, along with a count of how many times each occurred in the text. For example: 231 - i will not, 116 - i do not, 105 - there is no, 54 - i know not, 37 - i am not …
The program ignores punctuation, line endings, and is case insensitive (e.g. “I love\nsandwiches.” should be treated the same as "(I LOVE SANDWICHES!!)")
The program is capable of processing large files and runs as fast as possible.
The program should be tested. Provide a test file for your solution.
So, set your own pace. No rush. And just so you have some bounds, the challenge was designed to take somewhere between 2 and 3 hours.
You could try it against "Origin Of Species" as a test: http://www.gutenberg.org/cache/epub/2009/pg2009.txt .
So, we have the definition, I have done couple solutiosn, but let's see the one which I've sent:
=====Solution=====
The solution can be done using Sync and Async way, so let's check each:
====Sync====
The sync way, is little slower, by slower I mean (0.00000001) seconds :)
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Reflection.Metadata;
using System.Text;
namespace ChallengeNewRelic
{
class Program
{
static void Main(string[] args)
{
StringBuilder currentSequence = new StringBuilder();
StringBuilder currentWord = new StringBuilder();
Queue currentSequenceQueue = new Queue();
Dictionary sequences = new Dictionary();
int currentChunk;
char currentChar;
string dir = System.IO.Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location);
foreach (var file in args)
{
using (var fs =
BufferedStream(dir + @"\\" + file.ToString()))
{
byte[] buffer = new byte[4096];
long bytesRead;
while ((currentChunk = fs.Read(buffer,0,buffer.Length)) > 0)
{
for (int i = 0; i < buffer.Length; i++)
{
currentChar = Char.ToLowerInvariant(Convert.ToChar(buffer[i]));
if (char.IsLetterOrDigit(currentChar))
{
currentWord.Append(currentChar);
}
if (((char.IsPunctuation(currentChar) || char.IsSeparator(currentChar) ||
char.IsWhiteSpace(currentChar)) || fs.Position == fs.Length) &&
currentWord.ToString() != "")
{
currentSequenceQueue.Enqueue(currentWord.ToString());
currentWord.Clear();
if (currentSequenceQueue.Count < 3)
{
continue;
}
currentSequence.Append(string.Join(" ", currentSequenceQueue));
if (!sequences.ContainsKey(currentSequence.ToString()))
{
sequences[currentSequence.ToString()] = 1;
}
else
{
sequences[currentSequence.ToString()]++;
}
currentSequenceQueue.Dequeue();
currentSequence.Clear();
}
}
}
}
}
foreach (var sequence in sequences.OrderByDescending(x => x.Value).Take(100))
{
Console.WriteLine($"{sequence.Key} - {sequence.Value}");
}
}
}
}
====Async====
The Async way, was to show that i Know how to write Async programs (I had to explain why I decided to send them the Async instead of the sync, but anyway)
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace ChallengeNewRelicV3
{
class NewRelicCHallenge
{
public static ConcurrentDictionary sequences = new ConcurrentDictionary();
public static async Task Main(string[] args)
{
string dir = System.IO.Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location);
try
{
if (args.Length > 0)
{
Parallel.ForEach(args, async (file) =>
{
{
using (var str = File.OpenText(dir + @"\\" + file.ToString()))
{
await ProcessStream(str);
}
}
});
}
else
{
using (var stdStream = Console.In)
{
await ProcessStream(stdStream);
}
}
}
catch (Exception ex)
{
Console.WriteLine(ex.Message);
}
foreach (var sequence in sequences.OrderByDescending(x => x.Value).Take(100))
{
Console.WriteLine($"{sequence.Key} - {sequence.Value}");
}
}
private async static Task ProcessStream(TextReader str)
{
string currentLine = string.Empty;
List currentLineList = new List();
Queue currentSequenceQueue = new Queue();
//Dictionary sequences = new Dictionary();
while ((currentLine = str.ReadLine()) != null)
{
currentLineList = Regex.Replace(currentLine, @"[^A-Za-z0-9]+", " ")
.Split(" ", StringSplitOptions.RemoveEmptyEntries).ToList();
foreach (var word in currentLineList)
{
currentSequenceQueue.Enqueue(word);
if (currentSequenceQueue.Count < 3)
{
continue;
}
string currentSequence = String.Join(" ", currentSequenceQueue).ToLower();
if (!sequences.ContainsKey(currentSequence))
{
sequences[currentSequence] = 1;
}
else
{
sequences[currentSequence]++;
}
currentSequenceQueue.Dequeue();
}
}
}
}
}