From 9e865cbe6cca6ea3e4e0229020d74c74d5564ccd Mon Sep 17 00:00:00 2001 From: William Heng Date: Sat, 27 May 2017 21:15:10 +0100 Subject: [PATCH] Initial commit --- .gitignore | 106 +++++++++++++++ elementType.go | 36 +++++ main.go | 356 +++++++++++++++++++++++++++++++++++++++++++++++++ parser.go | 203 ++++++++++++++++++++++++++++ 4 files changed, 701 insertions(+) create mode 100644 .gitignore create mode 100644 elementType.go create mode 100644 main.go create mode 100644 parser.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6b06e94 --- /dev/null +++ b/.gitignore @@ -0,0 +1,106 @@ + +# Created by https://www.gitignore.io/api/go,macos,intellij+iml + +### Go ### +# Binaries for programs and plugins +*.exe +*.dll +*.so +*.dylib + +# Test binary, build with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +# Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736 +.glide/ + +### Intellij+iml ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff: +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/dictionaries + +# Sensitive or high-churn files: +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.xml +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml + +# Gradle: +.idea/**/gradle.xml +.idea/**/libraries + +# CMake +cmake-build-debug/ + +# Mongo Explorer plugin: +.idea/**/mongoSettings.xml + +## File-based project format: +*.iws + +## Plugin-specific files: + +# IntelliJ +/out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +### Intellij+iml Patch ### +# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 + +*.iml +modules.xml +.idea/misc.xml +*.ipr + +### macOS ### +*.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +# End of https://www.gitignore.io/api/go,macos,intellij+iml diff --git a/elementType.go b/elementType.go new file mode 100644 index 0000000..080750d --- /dev/null +++ b/elementType.go @@ -0,0 +1,36 @@ +package main + +type ElementType int + +const ( + NotFound ElementType = iota + Class + Method + Field + Constructor + Interface + Exception + Error + Enum + Trait + Notation + Package +) + +var VALUES = map[ElementType]string{ + Class: "Class", + Method: "Method", + Field: "Field", + Constructor: "Constructor", + Interface: "Interface", + Exception: "Exception", + Error: "Error", + Enum: "Enum", + Trait: "Trait", + Notation: "Notation", + Package: "Package", +} + +func (e *ElementType) value() string { + return VALUES[*e] +} diff --git a/main.go b/main.go new file mode 100644 index 0000000..64a1e66 --- /dev/null +++ b/main.go @@ -0,0 +1,356 @@ +package main + +import ( + "os" + "path" + "github.com/inconshreveable/log15" + "path/filepath" + "errors" + "strings" + "io" + "database/sql" + _ "github.com/mattn/go-sqlite3" + "fmt" +) + +const OVERVIEW_SUMMARY = "overview-summary.html" + +var log = log15.New() + +var toIndex []string + +func main() { + + arguments := os.Args + argLength := len(arguments) + if (argLength == 2 && arguments[1] == "--help") { + printUsage() + return + } else if (argLength != 3) { + log.Error("Invalid argument(s) provided") + printUsage() + os.Exit(1) + } + + docsetName := path.Clean(arguments[1]) + var javadocPath = path.Clean(arguments[2]) + + log.Info("Running with arguments", "docsetName", docsetName, "javadocPath", javadocPath) + + docsetDirectoryPath := docsetName + ".docset" + + if exists, _ := pathExists(docsetDirectoryPath); exists { + log.Info("Removing existing docset directory", "Docset directory path", docsetDirectoryPath) + if err := os.RemoveAll(docsetDirectoryPath); err != nil { + log.Error( + "Unable to remove existing docset directory", + "Docset directory path", docsetDirectoryPath, + "error", err, + ) + os.Exit(1) + } + } + + contentsDirectoryPath := docsetDirectoryPath + "/Contents" + resourcesDirectoryPath := contentsDirectoryPath + "/Resources" + documentsDirectoryPath := resourcesDirectoryPath + "/Documents" + + log.Info("Creating docset folder structure...") + if err := os.MkdirAll(documentsDirectoryPath, os.ModePerm); err != nil { + log.Error("Unable to create docset folder structure", "Docset directory", docsetDirectoryPath) + os.Exit(1) + } + + var docsetIndexFile string + overviewSummaryPath := javadocPath + OVERVIEW_SUMMARY + var summaryFound = false + + if exists, _ := pathExists(overviewSummaryPath); !exists { + + walkCount := 0 + filepath.Walk(javadocPath, func(filePath string, info os.FileInfo, err error) error { + + if err != nil { + log.Error("Failed to walk path", "path", filePath, "err", err) + os.Exit(1) + } + + walkCount++ + + if walkCount < 10000 { + + if info.Name() == OVERVIEW_SUMMARY { + javadocPath = path.Dir(filePath) + summaryFound = true + } + + return nil + } else { + return errors.New("Hit file enumeration limit") + } + }) + } else { + summaryFound = true + } + + if summaryFound { + docsetIndexFile = OVERVIEW_SUMMARY + } + + hasMultipleIndices := false + indexFilesPath := javadocPath + "index-files" + if exists, _ := pathExists(indexFilesPath); exists { + if docsetIndexFile == "" { + docsetIndexFile = "index-files/index-1.html" + } + hasMultipleIndices = true + } + log.Info("Done!") + + copyFiles(documentsDirectoryPath, javadocPath) + + documentsDirectoryIndex := documentsDirectoryPath + "/index-all.html" + if exists, _ := pathExists(documentsDirectoryIndex); !hasMultipleIndices && exists { + + toIndex = append(toIndex, documentsDirectoryIndex) + + if docsetIndexFile == "" { + docsetIndexFile = "index-all.html" + } + + } else { + + indexFilesPath := documentsDirectoryPath + "/index-files" + filepath.Walk(indexFilesPath, func(filePath string, info os.FileInfo, err error) error { + + if err != nil { + log.Error("Failed to walk path", "filePath", filePath, "err", err) + os.Exit(1) + } + + filename := info.Name() + if strings.HasPrefix(filename, "index-") && strings.HasSuffix(filename, ".html") { + toIndex = append(toIndex, filePath) + } + return err + }) + + } + + if len(toIndex) == 0 { + log.Error("API folder specified does not contain any index files (either an 'index-all.html' file or an 'index-files' folder and is not valid") + printUsage() + return + } + + writeInfoPlist(docsetName, docsetIndexFile, contentsDirectoryPath) + + initDB(resourcesDirectoryPath, index(toIndex)) +} + +func printUsage() { + log.Info("Usage: javadocset ") + log.Info(" - anything you want") + log.Info(" - the path of the javadoc API folder you want to index") +} + +func copyFiles(documentsDirectoryPath, javadocPath string) { + log.Info("Copying files...", "source", javadocPath, "destination", documentsDirectoryPath) + + src := path.Clean(javadocPath) + dst := path.Clean(documentsDirectoryPath) + + srcBase := path.Base(src) + filepath.Walk(src, func(filePath string, info os.FileInfo, err error) error { + + if err != nil { + log.Error("Error walking path", "filePath", filePath) + os.Exit(1) + } + + if info.IsDir() { + + if path.Base(filePath) != srcBase { + + // We only want to copy the directories within the source directory + // to the destination directory + directoryName := strings.Split(filePath, srcBase)[1] + + err := os.MkdirAll(dst + directoryName, os.ModePerm) + + if err != nil { + log.Error("Unable to create directory", "directory", directoryName) + os.Exit(1) + } + } + + } else { + + // Copy file + fileName := filepath.Base(filePath) + directoryName := strings.Split(filepath.Dir(filePath), srcBase)[1] + + dstPath := filepath.Clean(dst + directoryName + "/" + fileName) + + err = copyFileContents(filePath, dstPath) + + if err != nil { + log.Error("Unable to copy file", "src", filePath, "dst", dstPath) + os.Exit(1) + } + } + + return err + }) + + log.Info("Done!") +} + +func writeInfoPlist(docsetName, docsetIndexFile, contentsDirectoryPath string) { + plistContentTemplate := "CFBundleIdentifier %vCFBundleName %v DocSetPlatformFamily %v dashIndexFilePath%vDashDocSetFamilyjavaisDashDocset" + + docsetIdentifier := firstPhraseLowerCased(docsetName) + + plistContent := fmt.Sprintf( + plistContentTemplate, + docsetIdentifier, + docsetName, + docsetIdentifier, + docsetIndexFile, + ) + + infoPlistPath := contentsDirectoryPath + "/Info.plist" + err := writeStringToFile(plistContent, infoPlistPath) + if err != nil { + log.Error("Unable to write to plist file", "plistPath", infoPlistPath) + } +} + +func initDB(resourcesDirectoryPath string, dbFunc func(*sql.DB)) { + + dbPath := filepath.Clean(resourcesDirectoryPath + "/docset.dsidx") + + // We don't care, we just want to remove the index + os.Remove(dbPath) + + db, err := sql.Open("sqlite3", dbPath) + + if err != nil { + log.Error("Unable to create sqlite database", "destination", dbPath, "error", err) + os.Exit(1) + } + defer db.Close() + + _, err = db.Exec("CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT)") + if err != nil { + log.Error("Unable to create table", "error", err) + os.Exit(1) + } + + if dbFunc != nil { + dbFunc(db) + } +} + +func index(indicesToIndex []string) func(db *sql.DB) { + return func(db *sql.DB) { + + tx, err := db.Begin() + if err != nil { + log.Error("Unable to begin transactions for database", "error", err) + os.Exit(1) + } + + stmt, err := tx.Prepare("INSERT INTO searchIndex(name, type, path) VALUES (?, ?, ?)") + if err != nil { + log.Error("Unable to create statement to insert into database", "error", err) + os.Exit(1) + } + defer stmt.Close() + + added := make(map[string]bool) + + for _, toIndex := range indicesToIndex { + parseIndex(toIndex, func(entry IndexEntry) { + + name, elementType, path := entry.name, entry.elementType.value(), entry.path + + uniqueKey := name + elementType + path + + if !added[uniqueKey] { + _, err := stmt.Exec(name, elementType, path) + if err != nil { + log.Error( + "Unable to insert entry", + "name", name, + "elementType", elementType, + "path", path, + ) + os.Exit(1) + } + + added[uniqueKey] = true + } + }) + } + + tx.Commit() + } +} + + +/** +Utility functions + */ +func pathExists(path string) (bool, error) { + _, err := os.Stat(path) + if err == nil { + return true, nil + } + if os.IsNotExist(err) { + return false, nil + } + return true, err +} + +func writeStringToFile(content, dst string) error { + file, err := os.Create(dst) + if err != nil { + return err + } + defer file.Close() + + _, err = file.Write([]byte(content)) + return err +} + +func copyFileContents(src, dst string) (err error) { + + in, err := os.Open(src) + if err != nil { + return + } + defer in.Close() + out, err := os.Create(dst) + if err != nil { + return + } + defer func() { + cerr := out.Close() + if err == nil { + err = cerr + } + }() + if _, err = io.Copy(out, in); err != nil { + log.Error("Error copying", "error", err) + return + } + err = out.Sync() + return +} + +func firstPhraseLowerCased(s string) string { + return strings.ToLower(func() string { + return strings.Split(s, " ")[0] + }()) +} diff --git a/parser.go b/parser.go new file mode 100644 index 0000000..38e09ea --- /dev/null +++ b/parser.go @@ -0,0 +1,203 @@ +package main + +import ( + "os" + "github.com/yhat/scrape" + "strings" + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +type Verifier func(string) bool +type TypeEvaluator func(Verifier, Verifier) bool +type IndexEntry struct { + name string + elementType ElementType + path string +} + +var ALL_ELEMENT_TYPES = []ElementType{ + Class, + Method, + Field, + Constructor, + Interface, + Exception, + Error, + Enum, + Trait, + Notation, + Package, +} + +var ELEMENT_TYPE_TO_TYPE_EVALUATORS = map[ElementType][]TypeEvaluator{ + Class: NewTypeEvaluators(isClass), + Method: NewTypeEvaluators(isStaticMethod, isMethod), + Field: NewTypeEvaluators(isStaticField, isField), + Constructor: NewTypeEvaluators(isConstructor), + Interface: NewTypeEvaluators(isInterface), + Exception: NewTypeEvaluators(isException), + Error: NewTypeEvaluators(isError), + Enum: NewTypeEvaluators(isEnum), + Trait: NewTypeEvaluators(isTrait), + Notation: NewTypeEvaluators(isNotation), + Package: NewTypeEvaluators(isPackage), +} + +func parseIndex(indexFilePath string, entryHandler func(IndexEntry)) { + + log.Info("Indexing from file", "file", indexFilePath) + + indexed := 0 + file, err := os.OpenFile(indexFilePath, os.O_RDONLY, 0666) + + if err != nil { + log.Error("Unable to open file", "file", indexFilePath, "error", err) + return + } + + root, err := html.Parse(file) + + if err != nil { + log.Error("Unable to parse index", "file", file, "error", err) + return + } + + anchorTags := scrape.FindAll(root, scrape.ByTag(atom.A)) + + for _, tag := range anchorTags { + var parentTag = tag.Parent + + if parentTag.FirstChild != tag { + continue + } + + isParentSpan := parentTag.DataAtom == atom.Span + isParentCode := parentTag.DataAtom == atom.Code + isParentItalic := parentTag.DataAtom == atom.I + isParentBold := parentTag.DataAtom == atom.B + + if isParentSpan || isParentCode || isParentItalic || isParentBold { + parentTag = parentTag.Parent + if parentTag.FirstChild != tag.Parent { + continue + } + } + + if parentTag.DataAtom != atom.Dt { + continue + } + + text := scrape.Text(parentTag) + var tagType ElementType = NotFound + var dtClassName = scrape.Attr(parentTag, "class") + + lowercaseText := strings.ToLower(text) + + textContainsInsensitive := func(s string) bool { + return strings.Contains(lowercaseText, s) + } + + dtClassNameHasSuffix := func(s string) bool { + return strings.HasSuffix(dtClassName, s) + } + + tagTypeDetermined := false + + for _, elementType := range ALL_ELEMENT_TYPES { + + typeEvaluators := ELEMENT_TYPE_TO_TYPE_EVALUATORS[elementType] + + for _, evaluator := range typeEvaluators { + + if evaluator(textContainsInsensitive, dtClassNameHasSuffix) { + tagType = elementType + tagTypeDetermined = true + break + } + } + + if tagTypeDetermined { + break + } + } + + if tagType == NotFound { + log.Warn("Warning: could not determine type", "text", text, "dtClassName", dtClassName) + continue + } + + name := scrape.Text(tag) + path := scrape.Attr(tag, "href") + + entryHandler(IndexEntry{name: name, elementType: tagType, path: path}) + + indexed++ + } + + log.Info("Indexed", "count", indexed) +} + +func NewTypeEvaluators(a TypeEvaluator, others ...TypeEvaluator) []TypeEvaluator { + + typeEvaluators := make([]TypeEvaluator, 1 + len(others)) + + typeEvaluators[0] = a + for i, typeEvaluator := range others { + typeEvaluators[i + 1] = typeEvaluator + } + + return typeEvaluators +} + +func isClass(textContainsInsensitive, dtClassNameHasSuffix Verifier) bool { + return textContainsInsensitive("class in") || textContainsInsensitive("- class") || dtClassNameHasSuffix("class") +} + +func isStaticMethod(textContainsInsensitive, dtClassNameHasSuffix Verifier) bool { + return textContainsInsensitive("static method in") || dtClassNameHasSuffix("method") +} + +func isStaticField(textContainsInsensitive, dtClassNameHasSuffix Verifier) bool { + return textContainsInsensitive("static variable in") || textContainsInsensitive("field in") || dtClassNameHasSuffix("field") +} + +func isConstructor(textContainsInsensitive, dtClassNameHasSuffix Verifier) bool { + return textContainsInsensitive("constructor") || dtClassNameHasSuffix("constructor") +} + +func isMethod(textContainsInsensitive, dtClassNameHasSuffix Verifier) bool { + return textContainsInsensitive("method in") +} + +func isField(textContainsInsensitive, dtClassNameHasSuffix Verifier) bool { + return textContainsInsensitive("variable in") +} + +func isInterface(textContainsInsensitive, dtClassNameHasSuffix Verifier) bool { + return textContainsInsensitive("interface in") || textContainsInsensitive("- interface") || dtClassNameHasSuffix("interface") +} + +func isException(textContainsInsensitive, dtClassNameHasSuffix Verifier) bool { + return textContainsInsensitive("exception in") || textContainsInsensitive("- exception") || dtClassNameHasSuffix("exception") +} + +func isError(textContainsInsensitive, dtClassNameHasSuffix Verifier) bool { + return textContainsInsensitive("error in") || textContainsInsensitive("- error") || dtClassNameHasSuffix("error") +} + +func isEnum(textContainsInsensitive, dtClassNameHasSuffix Verifier) bool { + return textContainsInsensitive("enum in") || textContainsInsensitive("- enum") || dtClassNameHasSuffix("enum") +} + +func isTrait(textContainsInsensitive, dtClassNameHasSuffix Verifier) bool { + return textContainsInsensitive("trait in") +} + +func isNotation(textContainsInsensitive, dtClassNameHasSuffix Verifier) bool { + return textContainsInsensitive("annotation type") || dtClassNameHasSuffix("annotation") +} + +func isPackage(textContainsInsensitive, dtClassNameHasSuffix Verifier) bool { + return textContainsInsensitive("package") || dtClassNameHasSuffix("package") +}