Experiments with Markov chains, n-grams, and text generation.

download.sh 4.2KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. #!/usr/bin/env bash
  2. ### MediaWiki helpers ###
  # Iterates over all the pages in a MediaWiki category, printing one page
  # title per line on stdout. Progress messages are written to stderr.
  # Handles continues (multi-request page lists): as long as a response carries
  # a continuation token, another request is made with that token.
  # $1 - Wiki api.php root
  # $2 - Category name. It is placed into the URL as-is, so it must already
  #      be URL-safe.
  # Returns non-zero if the temporary file cannot be created, a request fails,
  # or a response cannot be parsed.
  function iterate_category() {
    # Example url: https://starbounder.org/mediawiki/api.php?action=query&format=json&list=categorymembers&cmlimit=max&cmtitle=Category:Craftables
    local initial_url="$1?action=query&format=json&list=categorymembers&cmlimit=max&cmtitle=$2"
    local next_url="${initial_url}"
    # Declared separately from the assignments below so that a failing command
    # substitution is not masked by the exit status of `local`.
    local temp_file continue_code

    # --suffix is a GNU extension; fall back to a plain mktemp where it is
    # not supported (the error from the first attempt is deliberately hidden).
    temp_file="$(mktemp --suffix .CategoryDownloaded.json 2>/dev/null || mktemp)" || return 1

    echo "Iterating over category $2 at $1" >&2
    echo "Temporary file is at ${temp_file}" >&2

    while :; do
      echo "Fetching ${next_url}" >&2
      # --fail makes HTTP error responses count as failures. Without a check
      # here, a failed request would leave nothing parsable in the file and
      # the loop would never reach its exit condition.
      if ! curl -sS --fail -o "${temp_file}" "${next_url}"; then
        echo "Failed to fetch ${next_url}" >&2
        rm -f -- "${temp_file}"
        return 1
      fi

      if ! jq --raw-output '.query.categorymembers[].title' <"${temp_file}"; then
        echo "Unexpected response from ${next_url}" >&2
        rm -f -- "${temp_file}"
        return 1
      fi

      # Extract the continuation token, percent-encoded so that characters
      # such as "|" are safe inside the next URL. Nothing is printed when the
      # response has no token.
      if ! continue_code="$(jq --raw-output '.continue.cmcontinue // empty | @uri' <"${temp_file}")"; then
        rm -f -- "${temp_file}"
        return 1
      fi

      # If there's no continuation token in the response, then we must be done
      if [[ -z "${continue_code}" ]]; then
        echo "Category iteration complete" >&2
        break
      fi

      next_url="${initial_url}&cmcontinue=${continue_code}"
    done

    rm -f -- "${temp_file}"
  }
  ### CrossCode ###
  # Download the items page, extract the elements matching the CSS selector
  # "table a", drop lines that contain no fields (blank lines) and store the
  # sorted result.
  curl https://crosscode.gamepedia.com/Items \
    | xidel --data - --css "table a" \
    | awk "NF > 0" \
    | sort \
    >Cross-Code-Items.txt
  ### Final Fantasy XV ###
  # Downloads a page and prints what xidel extracts for a CSS selector.
  # $1 - Page URL
  # $2 - CSS selector
  function _ffxv_scrape() {
    curl "$1" | xidel --data - --css "$2"
  }
  # Filter: removes everything from the first "(" to the end of each line.
  function _ffxv_strip_parenthetical() {
    sed -e 's/(.*$//g'
  }
  # All pages are gathered in one group and sorted once. The resulting file is
  # the sorted concatenation of every page's entries.
  {
    _ffxv_scrape 'http://finalfantasy.wikia.com/wiki/Treasures_(Final_Fantasy_XV)' "table th.b[rowspan=4]"
    _ffxv_scrape 'http://finalfantasy.wikia.com/wiki/Ingredients' "table th.b[rowspan=4]" | _ffxv_strip_parenthetical
    _ffxv_scrape 'http://finalfantasy.wikia.com/wiki/Auto_Parts' "table th.b[rowspan=2]"
    _ffxv_scrape 'http://finalfantasy.wikia.com/wiki/Leisure_Goods' "table.article-table tr:not(.a) th.b"
    _ffxv_scrape 'http://finalfantasy.wikia.com/wiki/Key_Items_(Final_Fantasy_XV)' "table.article-table tr:not(.a) th.b"
    _ffxv_scrape 'http://finalfantasy.wikia.com/wiki/List_of_Final_Fantasy_XV_items' "table.article-table tr:not(.a) th.b" | _ffxv_strip_parenthetical
    _ffxv_scrape 'http://finalfantasy.wikia.com/wiki/List_of_Final_Fantasy_XV_accessories' "table.article-table tr:not(.a) th.b" | _ffxv_strip_parenthetical
  } | sort >Final-Fantasy-15-Items.txt
  ### No Man's Sky ###
  # Extract the item entries from the list page, cut each line at the first
  # em dash (together with any whitespace before it) and store the sorted
  # result.
  curl "http://orcz.com/No_Man's_Sky:_Items_List" \
    | xidel --data - --css "table td:first-child a, #mw-content-text > ul > li" \
    | sed -e 's/\s*—.*$//g' \
    | sort \
    >No-Mans-Sky-Items.txt
  ### Stardew Valley ###
  # Extract the elements matching "td.ts a" from the page and store them
  # sorted.
  curl https://stardewids.com/ \
    | xidel --data - --css "td.ts a" \
    | sort \
    >Stardew-Valley-Items.txt
  42. # --no-split --lowercase --order 4 --length 12
  43. # --no-split --start-uppercase --order 4 --length 12
  44. # --no-split --start-uppercase --order 3 --length 12
  ### Recipes Wikia ###
  # Builds Dishes.txt from the wiki's sitemap:
  #  1. read the sitemap index and keep the <loc> entries containing "NS_0"
  #     (presumably the sitemaps of the main namespace - TODO confirm)
  #  2. download each of those sitemaps and collect their <loc> entries
  #  3. reduce every URL to the text after its last "/", turn "_" into spaces
  #     and percent-decode it
  #  4. drop entries containing "Nutrient" (case-insensitive) and sort
  #
  # Each sitemap URL is handed to the inner shell as a positional parameter
  # ("$1") rather than being spliced into the command string, so characters in
  # the downloaded data are never interpreted as shell syntax.
  # The decoding step uses python3 and works on raw bytes, so it does not
  # depend on the locale's text encoding.
  curl http://recipes.wikia.com/sitemap-newsitemapxml-index.xml \
    | xidel --data - --css "loc" \
    | grep -i NS_0 \
    | xargs -n1 sh -c 'curl "$1" | xidel --data - --css "loc"' _ \
    | sed -e 's/^.*\///g' -e 's/_/ /g' \
    | python3 -c 'import sys, urllib.parse; sys.stdout.buffer.write(urllib.parse.unquote_to_bytes(sys.stdin.buffer.read()))' \
    | grep -iv "Nutrient" \
    | sort >Dishes.txt
  47. ### Rise of Berk Dragons list ###
  # Prints the titles of the members of a Rise of Berk wiki category whose
  # title does not contain "Category:" (case-insensitive), one per line.
  # Only a single request is made; continuation is not followed, so at most
  # one batch of members is listed.
  # $1 - Category title, e.g. Category:Dragons. Placed into the URL as-is.
  # Prints nothing (without a jq error) when the response has no pages.
  function list_pages() {
    local category="${1}"
    # "[]?" yields nothing instead of failing when .query.pages is absent.
    curl "http://riseofberk.wikia.com/api.php?action=query&generator=categorymembers&gcmtitle=${category}&gcmlimit=max&format=json" \
      | jq --raw-output '.query.pages[]? | .title' \
      | grep -iv Category:
  }
  # Prints the titles of the members of a Rise of Berk wiki category whose
  # title contains "Category:" (case-insensitive), one per line.
  # Only a single request is made; continuation is not followed, so at most
  # one batch of members is listed.
  # $1 - Category title, e.g. Category:Dragons. Placed into the URL as-is.
  # Prints nothing (without a jq error) when the response has no pages.
  function list_categories() {
    local category="${1}"
    # "[]?" yields nothing instead of failing when .query.pages is absent.
    curl "http://riseofberk.wikia.com/api.php?action=query&generator=categorymembers&gcmtitle=${category}&gcmlimit=max&format=json" \
      | jq --raw-output '.query.pages[]? | .title' \
      | grep -i Category:
  }
  # Store the page titles found in Category:Dragons.
  list_pages 'Category:Dragons' > Dragons.txt
  ### Starbound ###
  # First the entries from the Blocks page: keep only lines containing at least
  # one character other than ".", then sort and de-duplicate them.
  curl https://starbounder.org/Blocks \
    | xidel --data - --css ".gametable a" \
    | awk '/[^.].*/' \
    | sort \
    | uniq \
    >Starbound.txt
  # Then append the page titles of Category:Craftables in the order the API
  # returns them.
  iterate_category 'https://starbounder.org/mediawiki/api.php' 'Category:Craftables' \
    >>Starbound.txt
  ### Shakespeare's Complete Works ###
  # Save the plain-text file from Project Gutenberg unchanged.
  curl 'https://www.gutenberg.org/files/100/100-0.txt' \
    > Shakespeares-Works.txt