I am struggling to find/create an algorithm that can determine the pronounceability of random 5 letter combinations.
The closest thing I've found so far is from this related question.
How about generating a reasonably pronounceable combination from the start? I have done something where I generate a random Soundex code, and work back from that to a (usually) pronounceable original.
Based on a suggestion on the linked question to "Use a Markov model on letters"
Use a Markov model (on letters, not words, of course). The probability of a word is a pretty good proxy for ease of pronunciation.
I thought I would try it out and had some success.
I copied a list of real 5-letter words into a file to serve as my dataset (here...um, actually here).
Then I use a Markov model (based on one-grams, bigrams, and trigrams) to estimate how likely it is that a target word would appear in that dataset.
(Better results could be achieved with some sort of phonetic transcription as one of the steps.)
First, I calculate the probabilities of character sequences in the dataset.
For example, if 'A' occurs 50 times, and there are only 250 characters in the dataset, then 'A' has a probability of 50/250, or 0.2.
Do the same for the bigrams 'AB', 'AC', ...
Do the same for the trigrams 'ABC', 'ABD', ...
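Basically, my score for the word "ABCDE" is composed of the probabilities of its five letters (P(A), P(B), P(C), P(D), P(E)), its four bigrams (P(AB), P(BC), P(CD), P(DE)), and its three trigrams (P(ABC), P(BCD), P(CDE)).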
You could multiply all of these together to get the estimated probability of the target word appearing in the dataset, (but that is very small).
So instead, we take the logs of each and add them together.
Now we have a score which estimates how likely our target word would appear in the dataset.
I have coded this in C#, and find that a score greater than negative 160 is pretty good.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
namespace Pronouncability
{
class Program
{
// The 26 uppercase letters 'A'..'Z'; calculateProbabilities iterates over this
// array to seed every entry of the n-gram count tables.
public static char[] alphabet = new char[]{ 'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z' };
// Filled by loadWordList() in this static field initializer, so the word file
// is read when the class's static fields are initialized.
public static List<string> wordList = loadWordList(); //Dataset of 5-letter words
// Single shared random number generator used by getRandomWord().
public static Random rand = new Random();
// A candidate word is kept by Main only when its score is strictly greater than this value.
public const double SCORE_LIMIT = -160.00;
/// <summary>
/// Generates random words until 100 distinct ones score better than
/// SCORE_LIMIT according to the n-gram statistics of the dataset.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
public static void Main(string[] args)
{
    Dictionary<Tuple<char, char, char>, int> trigramCounts = new Dictionary<Tuple<char, char, char>, int>();
    Dictionary<Tuple<char, char>, int> bigramCounts = new Dictionary<Tuple<char, char>, int>();
    Dictionary<char, int> onegramCounts = new Dictionary<char, int>();
    calculateProbabilities(onegramCounts, bigramCounts, trigramCounts);

    // Denominators that turn the raw counts into probabilities.
    double totalTrigrams = (double)trigramCounts.Values.Sum();
    double totalBigrams = (double)bigramCounts.Values.Sum();
    double totalOnegrams = (double)onegramCounts.Values.Sum();

    SortedList<double, string> randomWordsScores = new SortedList<double, string>();
    while (randomWordsScores.Count < 100)
    {
        string randStr = getRandomWord();
        if (randomWordsScores.ContainsValue(randStr))
        {
            // This word has already been accepted; try another one.
            continue;
        }

        double score = getLikelyhood(randStr, trigramCounts, bigramCounts, onegramCounts, totalTrigrams, totalBigrams, totalOnegrams);

        // SortedList.Add throws ArgumentException when the key already exists,
        // so a word whose score ties exactly with an accepted word is skipped
        // instead of crashing the loop.
        if (score > SCORE_LIMIT && !randomWordsScores.ContainsKey(score))
        {
            randomWordsScores.Add(score, randStr);
        }
    }
    //Right now randomWordsScores contains 100 random words which have
    //a better score than the SCORE_LIMIT, sorted from worst to best.
}
/// <summary>
/// Generates a random 5-letter word made of uppercase letters 'A' through 'Z'.
/// </summary>
/// <returns>A string of exactly five uppercase letters.</returns>
public static string getRandomWord()
{
    char[] letters = new char[5];
    for (int i = 0; i < letters.Length; i++)
    {
        // Random.Next(min, max) excludes max, so the bound must be 'Z' + 1
        // for the letter 'Z' to be reachable.
        letters[i] = (char)rand.Next('A', 'Z' + 1);
    }
    return new string(letters);
}
/// <summary>
/// Returns a log-likelihood style score for a word: the sum of the log
/// probabilities of each letter and each adjacent letter pair, plus five
/// times the log probability of each adjacent letter triple.
/// Higher (less negative) scores mean the word looks more like the dataset.
/// </summary>
/// <param name="wordToScore">Word to score; it is upper-cased before lookup.</param>
/// <param name="trigramCounts">Occurrence count for every letter triple.</param>
/// <param name="bigramCounts">Occurrence count for every letter pair.</param>
/// <param name="onegramCounts">Occurrence count for every single letter.</param>
/// <param name="totalTrigrams">Sum of all trigram counts.</param>
/// <param name="totalBigrams">Sum of all bigram counts.</param>
/// <param name="totalOnegrams">Sum of all one-gram counts.</param>
/// <returns>The combined score (a sum of logarithms, so normally negative).</returns>
/// <exception cref="KeyNotFoundException">
/// The word contains a character or sequence that is missing from the count tables.
/// </exception>
public static double getLikelyhood(string wordToScore, Dictionary<Tuple<char, char,char>, int> trigramCounts, Dictionary<Tuple<char, char>, int> bigramCounts, Dictionary<char, int> onegramCounts, double totalTrigrams, double totalBigrams, double totalOnegrams)
{
    // Scaling factor applied to every trigram log-probability.
    const double trigramWeight = 5.0;

    // Invariant upper-casing keeps the table lookups independent of the
    // current culture (e.g. Turkish casing rules for 'i').
    string word = wordToScore.ToUpperInvariant();

    double score = 0;

    // The loops follow the word's real length, so every n-gram the word
    // contains is scored; a 5-letter word yields 5 letters, 4 pairs, 3 triples.
    for (int i = 0; i < word.Length; i++)
    {
        score += Math.Log(onegramCounts[word[i]] / totalOnegrams);
    }
    for (int i = 0; i + 1 < word.Length; i++)
    {
        Tuple<char, char> pair = Tuple.Create(word[i], word[i + 1]);
        score += Math.Log(bigramCounts[pair] / totalBigrams);
    }
    for (int i = 0; i + 2 < word.Length; i++)
    {
        Tuple<char, char, char> trio = Tuple.Create(word[i], word[i + 1], word[i + 2]);
        score += trigramWeight * Math.Log(trigramCounts[trio] / totalTrigrams);
    }
    return score;
}
/// <summary>
/// Fills the three count tables from the dataset (wordList).
/// Every letter, letter pair and letter triple over the alphabet starts
/// with a count of 1; each occurrence within the first five characters of
/// a dataset word then adds one to the matching entry.
/// </summary>
public static void calculateProbabilities(Dictionary<char, int> onegramCounts, Dictionary<Tuple<char, char>, int> bigramCounts, Dictionary<Tuple<char, char, char>, int> trigramCounts)
{
    const int wordLength = 5;

    // Seed all triples with 1 so no sequence ends up with a zero count.
    foreach (char first in alphabet)
    {
        foreach (char second in alphabet)
        {
            foreach (char third in alphabet)
            {
                trigramCounts[Tuple.Create(first, second, third)] = 1;
            }
        }
    }

    // Seed all pairs with 1.
    foreach (char first in alphabet)
    {
        foreach (char second in alphabet)
        {
            bigramCounts[Tuple.Create(first, second)] = 1;
        }
    }

    // Seed all single letters with 1.
    foreach (char letter in alphabet)
    {
        onegramCounts[letter] = 1;
    }

    // Tally what actually occurs in the dataset.
    foreach (string entry in wordList)
    {
        for (int start = 0; start + 3 <= wordLength; start++)
        {
            trigramCounts[Tuple.Create(entry[start], entry[start + 1], entry[start + 2])] += 1;
        }
        for (int start = 0; start + 2 <= wordLength; start++)
        {
            bigramCounts[Tuple.Create(entry[start], entry[start + 1])] += 1;
        }
        for (int index = 0; index < wordLength; index++)
        {
            onegramCounts[entry[index]] += 1;
        }
    }
}
/// <summary>
/// Reads the dataset (WordList) from the file "WordList.txt" in the current
/// working directory.
/// </summary>
/// <returns>
/// The upper-cased words from the file that consist of exactly five letters
/// 'A' through 'Z'. Entries of any other shape are dropped, because the
/// n-gram tables only have slots for those letters and assume 5-letter words.
/// </returns>
public static List<string> loadWordList()
{
    string filePath = "WordList.txt";
    string text = File.ReadAllText(filePath);

    // A null separator array makes Split break on any white space, so words
    // separated by newlines, tabs or repeated spaces are all handled, and
    // RemoveEmptyEntries discards the empty strings that would otherwise appear.
    return text
        .Split((char[])null, StringSplitOptions.RemoveEmptyEntries)
        .Select(word => word.ToUpperInvariant())
        .Where(word => word.Length == 5 && word.All(c => c >= 'A' && c <= 'Z'))
        .ToList();
}
}
}
In my example, I scale the trigram probabilities by 5.
I also add one to all of the counts, so we don't multiply by zero.
I'm not a php programmer, but the technique is pretty easy to implement.
Play around with some scaling factors, try different datasets, or add in some other checks like what you suggested above.