markov chain generation is pretty decent

This commit is contained in:
Chris Anderson 2008-06-01 15:07:56 -07:00
parent 4be3c5f12c
commit 8ff959a6f7
20 changed files with 25066 additions and 62 deletions

File diff suppressed because it is too large Load diff

34
examples/word_count/markov Executable file
View file

@ -0,0 +1,34 @@
#!/usr/bin/env ruby
# Markov-chain text generator.
#
# Starts from the word given as the first command-line argument and keeps
# printing a follow-up word chosen from the 'markov/chain-reduce' view of the
# 'word-count-example' database until no follow-up word is available.
require '../../couchrest'

couch = CouchRest.new("http://localhost:5984")
db = couch.database('word-count-example')

word = ARGV[0]
words = [word]
# View results are cached per word so a repeated word is only queried once.
wprobs = {}

while word
  $stdout.print ' ' if words.length > 1
  $stdout.print word
  $stdout.flush

  unless wprobs[word]
    wprobs[word] = db.view('markov/chain-reduce', :startkey => [word,nil], :endkey => [word,{}], :group_level => 2)
  end
  # Debugging aid: dump the candidate rows for the current word.
  # puts "search #{word} #{wprobs[word]['rows'].length}"
  # wprobs[word]['rows'].sort_by{|r|r['value']}.each{|r|puts [r['value'],r['key']].inspect}

  # Drop rows whose second key element is the empty string, order the rest by
  # ascending value, and pick one of the (at most) five highest at random.
  candidates = wprobs[word]['rows'].reject { |r| r['key'][1] == '' }.sort_by { |r| r['value'] }
  row = candidates.last(5).sort_by { rand }.first

  word = row ? row['key'][1] : nil
  words << word
end

$stdout.print '.'
$stdout.flush
puts
# `say #{words.join(' ')}`

View file

@ -0,0 +1,3 @@
function(doc) {
doc.title && doc.chunk && emit([doc.title, doc.chunk],null);
}

View file

@ -0,0 +1 @@
function(doc){if(doc.text && doc.text.match(/united/)) emit([doc.title, doc.chunk],null)}

View file

@ -0,0 +1,6 @@
function(doc){
var words = doc.text.split(/\W/).filter(function(w) {return w.length > 0}).map(function(w){return w.toLowerCase()});
for (var i = 0, l = words.length; i < l; i++) {
emit(words.slice(i,4),doc.title);
}
}

View file

@ -0,0 +1,7 @@
function(key,vs,c){
if (c) {
return sum(vs);
} else {
return vs.length;
}
}

View file

@ -0,0 +1,6 @@
function(doc){
var words = doc.text.split(/\W/).map(function(w){return w.toLowerCase()});
words.forEach(function(word){
if (word.length > 0) emit([word,doc.title],1);
});
}

View file

@ -0,0 +1,3 @@
function(key,combine){
return sum(combine);
}

View file

@ -2,16 +2,18 @@ require File.dirname(__FILE__) + '/../../couchrest'
couch = CouchRest.new("http://localhost:5984")
db = couch.database('word-count-example')
# db.delete! rescue nil
# db = couch.create_db('word-count-example')
db.delete! rescue nil
db = couch.create_db('word-count-example')
%w{america.txt da-vinci.txt outline-of-science.txt ulysses.txt}.each do |book|
# %w{}.each do |book|
title = book.split('.')[0]
puts title
File.open(File.join(File.dirname(__FILE__),book),'r') do |file|
lines = []
chunk = 0
while line = file.gets
puts chunk
lines << line
if lines.length > 10
db.save({

View file

@ -1,60 +0,0 @@
# Make the directory one level above this script part of the load path so the
# project-relative requires below resolve.
$: << File.expand_path(File.dirname(__FILE__)) + '/..'
require 'lib/parse'
require 'couchrest/couchrest'
require 'vendor/jsmin/lib/jsmin'
# require 'yaml'
# connect to couchdb
cr = CouchRest.new("http://localhost:5984")
# Best-effort create: any error raised by create_db is discarded
# (presumably the database already existing - confirm).
cr.create_db('grabbit-import') rescue nil
db = cr.database('grabbit-import')
# create views from files
views = {}
# Every .js file at any depth under ../views relative to this script.
viewfiles = Dir.glob(File.join(File.expand_path(File.dirname(__FILE__)),"..","views","**","*.js"))
# Paths matching lib.js hold shared library code rather than a view.
libfiles = viewfiles.select{|f|/lib\.js/.match(f)}
# For each remaining file, match its basename against "<a>-<b>.js" and build
# [whole match, <a>, <b>, full path].
all = (viewfiles-libfiles).collect do |file|
filename = /(\w.*)-(\w.*)\.js/.match file.split('/').pop
filename.to_a + [file]
end
# Contents of the first library file, kept for readjs to splice into views.
@libfuncs = open(libfiles[0]).read
# Reads a JavaScript view file, replaces the first "//include-lib" marker with
# the shared library source held in @libfuncs, and returns the result
# minified by JSMin.
#
# file - path of the JavaScript file to read.
def readjs(file)
  # File.read closes the file again; open(file).read left the handle open.
  st = File.read(file)
  # Block form: the library text is inserted literally. With a string
  # replacement, backslash sequences inside the JavaScript (e.g. "\\1", "\\0")
  # would be interpreted as back-references and corrupt the inserted code.
  st.sub!(/\/\/include-lib/) { @libfuncs }
  JSMin.minify(st)
end
# Group the parsed view files by design document (the name of the directory
# that contains the file), then by view name, and collect the minified source
# of every part of each view.
by_design = all.group_by { |entry| entry[3].split('/')[-2] }
by_design.each do |design, entries|
  views[design] ||= {}
  puts "design #{design}"
  by_view = entries.group_by { |entry| entry[1] }
  by_view.each do |view, parts|
    puts "view #{view}"
    reduce_name = "#{view}-reduce"
    views[design][reduce_name] ||= {}
    parts.each do |part|
      puts "part #{part.inspect}"
      views[design][reduce_name][part[2]] = readjs(part[3])
    end
    # Every view also gets a map-only variant; the "-reduce" variant is
    # dropped again when the view has no reduce part.
    views[design]["#{view}-map"] = {:map => views[design][reduce_name]['map']}
    unless views[design][reduce_name]['reduce']
      views[design].delete(reduce_name)
    end
  end
end

# Replace each design document: remove the stored one if it can be fetched
# (any error while doing so is ignored), then save the freshly built views.
views.each do |design, viewfuncs|
  begin
    view = db.get("_design/#{design}")
    db.delete(view)
  rescue
  end
  db.save({
    "_id" => "_design/#{design}",
    :views => viewfuncs
  })
end

View file

@ -0,0 +1,2 @@
function(doc){if(doc.mp3s){for(var i=0,m;m=doc.mp3s[i];i++){emit(m.href,doc.fetch.url);}}}

View file

@ -0,0 +1,3 @@
function(hrefs,ss){log(ss)
return ss[0];}

View file

@ -0,0 +1,2 @@
function(doc){var fetchurl=doc.fetch&&doc.fetch.url;if(!fetchurl)return;doc.entries&&doc.entries.forEach(function(e){e.mp3s&&e.mp3s.forEach(function(mp3){mp3.href&&emit(mp3.href,fetchurl);});});doc.playlist&&doc.playlist.track&&doc.playlist.track.forEach(function(t){t.location&&t.location.forEach(function(url){emit(url,fetchurl);});});doc.mp3s&&doc.mp3s.forEach(function(mp3){mp3.href&&emit(mp3.href,fetchurl);});}

View file

@ -0,0 +1,2 @@
function(ks,vs){log({keys:ks});log({values:vs});return 1;};

View file

@ -0,0 +1,2 @@
function(doc){doc.playlist&&doc.playlist.track&&doc.playlist.track.forEach(function(t){emit([t.creator||null,t.title||null],t.album||null);});};

View file

@ -0,0 +1,2 @@
function(doc){doc.playlist&&doc.playlist.track&&doc.playlist.track.forEach(function(t){if(t.creator||t.title){if(t.location){t.location.forEach(function(url){emit([t.creator||null,t.title||null],url);});}else{emit([t.creator||null,t.title||null],null);}}});};

View file

@ -0,0 +1,2 @@
function(ks,vs,c){if(c){return null;}else{log(ks[0][0][0]);return ks[0][0][0];}};

View file

@ -0,0 +1,2 @@
function(doc){emit(null,doc);}

View file

@ -0,0 +1,585 @@
/*
JS Beautifier
---------------
$Date: 2008-05-26 06:34:52 +0300 (Mon, 26 May 2008) $
$Revision: 55 $
Written by Einars "elfz" Lielmanis, <elfz@laacz.lv>
http://elfz.laacz.lv/beautify/
Originally converted to javascript by Vital, <vital76@gmail.com>
http://my.opera.com/Vital/blog/2007/11/21/javascript-beautify-on-javascript-translated
You are free to use this in any way you want, in case you find this useful or working for you.
Usage:
js_beautify(js_source_text);
*/
/**
 * Re-formats a string of JavaScript source: tokenizes it and prints the
 * tokens back with normalized spacing, line breaks and indentation.
 *
 * @param {string} js_source_text   the source text to format
 * @param {number} [indent_size]    how many indent characters make up one
 *                                  indentation level; any falsy value
 *                                  (including 0) falls back to 4
 * @param {string} [indent_character]  the character repeated to build one
 *                                  indentation level; falsy falls back to ' '
 * @returns {string} the formatted source
 */
function js_beautify(js_source_text, indent_size, indent_character)
{
    var input, output, token_text, last_type, last_text, last_word, current_mode, modes, indent_level, indent_string;
    var whitespace, wordchar, punct, parser_pos, line_starters, in_case;
    var prefix, token_type, do_block_just_closed, var_line, var_line_tainted;

    // Drop trailing spaces and indent units from the output buffer.
    function trim_output()
    {
        while (output.length && (output[output.length - 1] === ' ' || output[output.length - 1] === indent_string)) {
            output.pop();
        }
    }

    // Start a new line at the current indentation. Unless ignore_repeated is
    // explicitly false, a second consecutive newline is not emitted.
    function print_newline(ignore_repeated)
    {
        ignore_repeated = typeof ignore_repeated === 'undefined' ? true: ignore_repeated;
        trim_output();
        if (!output.length) {
            return; // no newline on start of file
        }
        if (output[output.length - 1] !== "\n" || !ignore_repeated) {
            output.push("\n");
        }
        for (var i = 0; i < indent_level; i++) {
            output.push(indent_string);
        }
    }

    // Emit one space unless the output already ends in whitespace.
    function print_space()
    {
        var last_output = output.length ? output[output.length - 1] : ' ';
        if (last_output !== ' ' && last_output !== '\n' && last_output !== indent_string) { // prevent occasional duplicate space
            output.push(' ');
        }
    }

    // Emit the current token text verbatim.
    function print_token()
    {
        output.push(token_text);
    }

    function indent()
    {
        indent_level++;
    }

    // Never goes below indentation level 0.
    function unindent()
    {
        if (indent_level) {
            indent_level--;
        }
    }

    // Take back one indent unit if it is the last thing that was printed.
    function remove_indent()
    {
        if (output.length && output[output.length - 1] === indent_string) {
            output.pop();
        }
    }

    // Enter a new mode, remembering the current one on the mode stack.
    function set_mode(mode)
    {
        modes.push(current_mode);
        current_mode = mode;
    }

    // Return to the previous mode; records whether a 'do' block just ended.
    function restore_mode()
    {
        do_block_just_closed = current_mode === 'DO_BLOCK';
        current_mode = modes.pop();
    }

    // Strict-equality membership test.
    function in_array(what, arr)
    {
        for (var i = 0; i < arr.length; i++)
        {
            if (arr[i] === what) {
                return true;
            }
        }
        return false;
    }

    // Read the next token starting at parser_pos and advance parser_pos past
    // it. Returns [text, type]. As a side effect it prints newlines for blank
    // lines found in the skipped whitespace, and before some comments/strings.
    function get_next_token()
    {
        var n_newlines = 0;
        var c = '';

        // skip whitespace, counting the line breaks in it
        do {
            if (parser_pos >= input.length) {
                return ['', 'TK_EOF'];
            }
            c = input.charAt(parser_pos);
            parser_pos += 1;
            if (c === "\n") {
                n_newlines += 1;
            }
        }
        while (in_array(c, whitespace));

        // two or more line breaks in the input are kept as one blank line
        if (n_newlines > 1) {
            for (var i = 0; i < 2; i++) {
                print_newline(i === 0);
            }
        }
        var wanted_newline = (n_newlines === 1);

        if (in_array(c, wordchar)) {
            if (parser_pos < input.length) {
                while (in_array(input.charAt(parser_pos), wordchar)) {
                    c += input.charAt(parser_pos);
                    parser_pos += 1;
                    if (parser_pos === input.length) {
                        break;
                    }
                }
            }

            // small and surprisingly unugly hack for 1E-10 representation.
            // A '+' sign is accepted as well: 1E+10 used to be split into
            // "1E", "+", "10" and printed as "1E + 10", which is not valid.
            if (parser_pos !== input.length && c.match(/^[0-9]+[Ee]$/) && (input.charAt(parser_pos) === '-' || input.charAt(parser_pos) === '+')) {
                var sign = input.charAt(parser_pos);
                parser_pos += 1;
                var t = get_next_token();
                c += sign + t[0];
                return [c, 'TK_WORD'];
            }

            if (c === 'in') { // hack for 'in' operator
                return [c, 'TK_OPERATOR'];
            }
            return [c, 'TK_WORD'];
        }

        if (c === '(' || c === '[') {
            return [c, 'TK_START_EXPR'];
        }

        if (c === ')' || c === ']') {
            return [c, 'TK_END_EXPR'];
        }

        if (c === '{') {
            return [c, 'TK_START_BLOCK'];
        }

        if (c === '}') {
            return [c, 'TK_END_BLOCK'];
        }

        if (c === ';') {
            return [c, 'TK_END_COMMAND'];
        }

        if (c === '/') {
            var comment = '';
            // peek for comment /* ... */
            if (input.charAt(parser_pos) === '*') {
                parser_pos += 1;
                if (parser_pos < input.length) {
                    while (! (input.charAt(parser_pos) === '*' && input.charAt(parser_pos + 1) && input.charAt(parser_pos + 1) === '/') && parser_pos < input.length) {
                        comment += input.charAt(parser_pos);
                        parser_pos += 1;
                        if (parser_pos >= input.length) {
                            break;
                        }
                    }
                }
                parser_pos += 2; // step over the closing */
                return ['/*' + comment + '*/', 'TK_BLOCK_COMMENT'];
            }
            // peek for comment // ...
            if (input.charAt(parser_pos) === '/') {
                comment = c;
                while (input.charAt(parser_pos) !== "\x0d" && input.charAt(parser_pos) !== "\x0a") {
                    comment += input.charAt(parser_pos);
                    parser_pos += 1;
                    if (parser_pos >= input.length) {
                        break;
                    }
                }
                parser_pos += 1; // step over the line terminator
                if (wanted_newline) {
                    print_newline();
                }
                return [comment, 'TK_COMMENT'];
            }
        }

        // A '/' is read as the start of a regexp literal only where a value is
        // expected, judged from the previous token.
        if (c === "'" || // string
        c === '"' || // string
        (c === '/' &&
        ((last_type === 'TK_WORD' && last_text === 'return') || (last_type === 'TK_START_EXPR' || last_type === 'TK_END_BLOCK' || last_type === 'TK_OPERATOR' || last_type === 'TK_EOF' || last_type === 'TK_END_COMMAND')))) { // regexp
            var sep = c;
            var esc = false;
            c = '';

            if (parser_pos < input.length) {
                // copy up to the matching, unescaped delimiter
                while (esc || input.charAt(parser_pos) !== sep) {
                    c += input.charAt(parser_pos);
                    if (!esc) {
                        esc = input.charAt(parser_pos) === '\\';
                    } else {
                        esc = false;
                    }
                    parser_pos += 1;
                    if (parser_pos >= input.length) {
                        break;
                    }
                }
            }

            parser_pos += 1; // step over the closing delimiter
            if (last_type === 'TK_END_COMMAND') {
                print_newline();
            }
            return [sep + c + sep, 'TK_STRING'];
        }

        if (in_array(c, punct)) {
            // extend to the longest operator listed in punct
            while (parser_pos < input.length && in_array(c + input.charAt(parser_pos), punct)) {
                c += input.charAt(parser_pos);
                parser_pos += 1;
                if (parser_pos >= input.length) {
                    break;
                }
            }
            return [c, 'TK_OPERATOR'];
        }

        return [c, 'TK_UNKNOWN'];
    }

    //----------------------------------
    indent_character = indent_character || ' ';
    indent_size = indent_size || 4;

    indent_string = '';
    while (indent_size--) {
        indent_string += indent_character;
    }

    input = js_source_text;

    last_word = ''; // last 'TK_WORD' passed
    last_type = 'TK_START_EXPR'; // last token type
    last_text = ''; // last token text
    output = [];

    do_block_just_closed = false;
    var_line = false; // currently inside a 'var' declaration list
    var_line_tainted = false; // an operator other than ',' was seen in that list

    whitespace = "\n\r\t ".split('');
    wordchar = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_$'.split('');
    punct = '+ - * / % & ++ -- = += -= *= /= %= == === != !== > < >= <= >> << >>> >>>= >>= <<= && &= | || ! !! , : ? ^ ^= |='.split(' ');

    // words which should always start on new line.
    line_starters = 'continue,try,throw,return,var,if,switch,case,default,for,while,break,function'.split(',');

    // states showing if we are currently in expression (i.e. "if" case) - 'EXPRESSION', or in usual block (like, procedure), 'BLOCK'.
    // some formatting depends on that.
    current_mode = 'BLOCK';
    modes = [current_mode];

    indent_level = 0;
    parser_pos = 0; // parser position
    in_case = false; // flag for parser that case/default has been processed, and next colon needs special attention

    while (true) {
        var t = get_next_token();
        token_text = t[0];
        token_type = t[1];
        if (token_type === 'TK_EOF') {
            break;
        }

        switch (token_type) {

        case 'TK_START_EXPR':
            var_line = false;
            set_mode('EXPRESSION');
            if (last_type === 'TK_END_EXPR' || last_type === 'TK_START_EXPR') {
                // do nothing on (( and )( and ][ and ]( ..
            } else if (last_type !== 'TK_WORD' && last_type !== 'TK_OPERATOR') {
                print_space();
            } else if (in_array(last_word, line_starters) && last_word !== 'function') {
                print_space();
            }
            print_token();
            break;

        case 'TK_END_EXPR':
            print_token();
            restore_mode();
            break;

        case 'TK_START_BLOCK':
            if (last_word === 'do') {
                set_mode('DO_BLOCK');
            } else {
                set_mode('BLOCK');
            }
            if (last_type !== 'TK_OPERATOR' && last_type !== 'TK_START_EXPR') {
                if (last_type === 'TK_START_BLOCK') {
                    print_newline();
                } else {
                    print_space();
                }
            }
            print_token();
            indent();
            break;

        case 'TK_END_BLOCK':
            if (last_type === 'TK_START_BLOCK') {
                // nothing
                trim_output();
                unindent();
            } else {
                unindent();
                print_newline();
            }
            print_token();
            restore_mode();
            break;

        case 'TK_WORD':
            if (do_block_just_closed) {
                // the word right after a do-block stays on the closing line
                print_space();
                print_token();
                print_space();
                break;
            }

            if (token_text === 'case' || token_text === 'default') {
                if (last_text === ':') {
                    // switch cases following one another
                    remove_indent();
                } else {
                    // case statement starts in the same line where switch
                    unindent();
                    print_newline();
                    indent();
                }
                print_token();
                in_case = true;
                break;
            }

            prefix = 'NONE';
            if (last_type === 'TK_END_BLOCK') {
                if (!in_array(token_text.toLowerCase(), ['else', 'catch', 'finally'])) {
                    prefix = 'NEWLINE';
                } else {
                    prefix = 'SPACE';
                    print_space();
                }
            } else if (last_type === 'TK_END_COMMAND' && (current_mode === 'BLOCK' || current_mode === 'DO_BLOCK')) {
                prefix = 'NEWLINE';
            } else if (last_type === 'TK_END_COMMAND' && current_mode === 'EXPRESSION') {
                prefix = 'SPACE';
            } else if (last_type === 'TK_WORD') {
                prefix = 'SPACE';
            } else if (last_type === 'TK_START_BLOCK') {
                prefix = 'NEWLINE';
            } else if (last_type === 'TK_END_EXPR') {
                print_space();
                prefix = 'NEWLINE';
            }

            if (last_type !== 'TK_END_BLOCK' && in_array(token_text.toLowerCase(), ['else', 'catch', 'finally'])) {
                print_newline();
            } else if (in_array(token_text, line_starters) || prefix === 'NEWLINE') {
                if (last_text === 'else') {
                    // no need to force newline on else break
                    print_space();
                } else if ((last_type === 'TK_START_EXPR' || last_text === '=') && token_text === 'function') {
                    // no need to force newline on 'function': (function
                    // DONOTHING
                } else if (last_type === 'TK_WORD' && (last_text === 'return' || last_text === 'throw')) {
                    // no newline between 'return nnn'
                    print_space();
                } else if (last_type !== 'TK_END_EXPR') {
                    if ((last_type !== 'TK_START_EXPR' || token_text !== 'var') && last_text !== ':') {
                        // no need to force newline on 'var': for (var x = 0...)
                        if (token_text === 'if' && last_type === 'TK_WORD' && last_word === 'else') {
                            // no newline for } else if {
                            print_space();
                        } else {
                            print_newline();
                        }
                    }
                }
            } else if (prefix === 'SPACE') {
                print_space();
            }
            print_token();
            last_word = token_text;

            if (token_text === 'var') {
                var_line = true;
                var_line_tainted = false;
            }
            break;

        case 'TK_END_COMMAND':
            print_token();
            var_line = false;
            break;

        case 'TK_STRING':
            if (last_type === 'TK_START_BLOCK' || last_type === 'TK_END_BLOCK') {
                print_newline();
            } else if (last_type === 'TK_WORD') {
                print_space();
            }
            print_token();
            break;

        case 'TK_OPERATOR':
            // whether a space goes before / after the operator
            var start_delim = true;
            var end_delim = true;
            if (var_line && token_text !== ',') {
                var_line_tainted = true;
                if (token_text === ':') {
                    var_line = false;
                }
            }

            if (token_text === ':' && in_case) {
                print_token(); // colon really asks for separate treatment
                print_newline();
                break;
            }

            in_case = false;

            if (token_text === ',') {
                if (var_line) {
                    if (var_line_tainted) {
                        print_token();
                        print_newline();
                        var_line_tainted = false;
                    } else {
                        print_token();
                        print_space();
                    }
                } else if (last_type === 'TK_END_BLOCK') {
                    print_token();
                    print_newline();
                } else {
                    if (current_mode === 'BLOCK') {
                        print_token();
                        print_newline();
                    } else {
                        // EXPR or DO_BLOCK
                        print_token();
                        print_space();
                    }
                }
                break;
            } else if (token_text === '--' || token_text === '++') { // unary operators special case
                if (last_text === ';') {
                    // space for (;; ++i)
                    start_delim = true;
                    end_delim = false;
                } else {
                    start_delim = false;
                    end_delim = false;
                }
            } else if (token_text === '!' && last_type === 'TK_START_EXPR') {
                // special case handling: if (!a)
                start_delim = false;
                end_delim = false;
            } else if (last_type === 'TK_OPERATOR') {
                start_delim = false;
                end_delim = false;
            } else if (last_type === 'TK_END_EXPR') {
                start_delim = true;
                end_delim = true;
            } else if (token_text === '.') {
                // decimal digits or object.property
                start_delim = false;
                end_delim = false;
            } else if (token_text === ':') {
                // zz: xx
                // can't differentiate ternary op, so for now it's a ? b: c; without space before colon
                if (last_text.match(/^\d+$/)) {
                    // a little help for ternary a ? 1 : 0;
                    start_delim = true;
                } else {
                    start_delim = false;
                }
            }
            if (start_delim) {
                print_space();
            }

            print_token();

            if (end_delim) {
                print_space();
            }
            break;

        case 'TK_BLOCK_COMMENT':
            print_newline();
            print_token();
            print_newline();
            break;

        case 'TK_COMMENT':
            // print_newline();
            print_space();
            print_token();
            print_newline();
            break;

        case 'TK_UNKNOWN':
            print_token();
            break;
        }

        last_type = token_type;
        last_text = token_text;
    }

    return output.join('');
}

View file

@ -0,0 +1,2 @@
function(ks,vs){return vs.length;}