parent
9e26ef4202
commit
45a9729855
@ -1,3 +1,13 @@
|
||||
# marseille-perils-webcrawler
|
||||
# Environnement virtuel pour python
|
||||
python3 -m venv scrapy-env
|
||||
(unix-like) source scrapy-env/bin/activate
|
||||
(windows) tutorial-env\Scripts\activate.bat
|
||||
|
||||
Un webcrawler basé sur Scrapy pour extraire les arrêtés de péril du site de la ville de Marseille
|
||||
# Installation des dépendances
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install regex
|
||||
python -m pip install scrapy
|
||||
|
||||
# Lancement de scrapy
|
||||
cd src
|
||||
scrapy crawl perils -O perils.csv
|
||||
|
||||
Binary file not shown.
@ -0,0 +1,247 @@
|
||||
<#
|
||||
.Synopsis
|
||||
Activate a Python virtual environment for the current PowerShell session.
|
||||
|
||||
.Description
|
||||
Pushes the python executable for a virtual environment to the front of the
|
||||
$Env:PATH environment variable and sets the prompt to signify that you are
|
||||
in a Python virtual environment. Makes use of the command line switches as
|
||||
well as the `pyvenv.cfg` file values present in the virtual environment.
|
||||
|
||||
.Parameter VenvDir
|
||||
Path to the directory that contains the virtual environment to activate. The
|
||||
default value for this is the parent of the directory that the Activate.ps1
|
||||
script is located within.
|
||||
|
||||
.Parameter Prompt
|
||||
The prompt prefix to display when this virtual environment is activated. By
|
||||
default, this prompt is the name of the virtual environment folder (VenvDir)
|
||||
surrounded by parentheses and followed by a single space (ie. '(.venv) ').
|
||||
|
||||
.Example
|
||||
Activate.ps1
|
||||
Activates the Python virtual environment that contains the Activate.ps1 script.
|
||||
|
||||
.Example
|
||||
Activate.ps1 -Verbose
|
||||
Activates the Python virtual environment that contains the Activate.ps1 script,
|
||||
and shows extra information about the activation as it executes.
|
||||
|
||||
.Example
|
||||
Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
|
||||
Activates the Python virtual environment located in the specified location.
|
||||
|
||||
.Example
|
||||
Activate.ps1 -Prompt "MyPython"
|
||||
Activates the Python virtual environment that contains the Activate.ps1 script,
|
||||
and prefixes the current prompt with the specified string (surrounded in
|
||||
parentheses) while the virtual environment is active.
|
||||
|
||||
.Notes
|
||||
On Windows, it may be required to enable this Activate.ps1 script by setting the
|
||||
execution policy for the user. You can do this by issuing the following PowerShell
|
||||
command:
|
||||
|
||||
PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
|
||||
|
||||
For more information on Execution Policies:
|
||||
https://go.microsoft.com/fwlink/?LinkID=135170
|
||||
|
||||
#>
|
||||
Param(
|
||||
[Parameter(Mandatory = $false)]
|
||||
[String]
|
||||
$VenvDir,
|
||||
[Parameter(Mandatory = $false)]
|
||||
[String]
|
||||
$Prompt
|
||||
)
|
||||
|
||||
<# Function declarations --------------------------------------------------- #>
|
||||
|
||||
<#
|
||||
.Synopsis
|
||||
Remove all shell session elements added by the Activate script, including the
|
||||
addition of the virtual environment's Python executable from the beginning of
|
||||
the PATH variable.
|
||||
|
||||
.Parameter NonDestructive
|
||||
If present, do not remove this function from the global namespace for the
|
||||
session.
|
||||
|
||||
#>
|
||||
function global:deactivate ([switch]$NonDestructive) {
|
||||
# Revert to original values
|
||||
|
||||
# The prior prompt:
|
||||
if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
|
||||
Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
|
||||
Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
|
||||
}
|
||||
|
||||
# The prior PYTHONHOME:
|
||||
if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
|
||||
Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
|
||||
Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
|
||||
}
|
||||
|
||||
# The prior PATH:
|
||||
if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
|
||||
Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
|
||||
Remove-Item -Path Env:_OLD_VIRTUAL_PATH
|
||||
}
|
||||
|
||||
# Just remove the VIRTUAL_ENV altogether:
|
||||
if (Test-Path -Path Env:VIRTUAL_ENV) {
|
||||
Remove-Item -Path env:VIRTUAL_ENV
|
||||
}
|
||||
|
||||
# Just remove VIRTUAL_ENV_PROMPT altogether.
|
||||
if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
|
||||
Remove-Item -Path env:VIRTUAL_ENV_PROMPT
|
||||
}
|
||||
|
||||
# Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
|
||||
if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
|
||||
Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
|
||||
}
|
||||
|
||||
# Leave deactivate function in the global namespace if requested:
|
||||
if (-not $NonDestructive) {
|
||||
Remove-Item -Path function:deactivate
|
||||
}
|
||||
}
|
||||
|
||||
<#
|
||||
.Description
|
||||
Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
|
||||
given folder, and returns them in a map.
|
||||
|
||||
For each line in the pyvenv.cfg file, if that line can be parsed into exactly
|
||||
two strings separated by `=` (with any amount of whitespace surrounding the =)
|
||||
then it is considered a `key = value` line. The left hand string is the key,
|
||||
the right hand is the value.
|
||||
|
||||
If the value starts with a `'` or a `"` then the first and last character is
|
||||
stripped from the value before being captured.
|
||||
|
||||
.Parameter ConfigDir
|
||||
Path to the directory that contains the `pyvenv.cfg` file.
|
||||
#>
|
||||
function Get-PyVenvConfig(
|
||||
[String]
|
||||
$ConfigDir
|
||||
) {
|
||||
Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
|
||||
|
||||
# Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
|
||||
$pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
|
||||
|
||||
# An empty map will be returned if no config file is found.
|
||||
$pyvenvConfig = @{ }
|
||||
|
||||
if ($pyvenvConfigPath) {
|
||||
|
||||
Write-Verbose "File exists, parse `key = value` lines"
|
||||
$pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
|
||||
|
||||
$pyvenvConfigContent | ForEach-Object {
|
||||
$keyval = $PSItem -split "\s*=\s*", 2
|
||||
if ($keyval[0] -and $keyval[1]) {
|
||||
$val = $keyval[1]
|
||||
|
||||
# Remove extraneous quotations around a string value.
|
||||
if ("'""".Contains($val.Substring(0, 1))) {
|
||||
$val = $val.Substring(1, $val.Length - 2)
|
||||
}
|
||||
|
||||
$pyvenvConfig[$keyval[0]] = $val
|
||||
Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
|
||||
}
|
||||
}
|
||||
}
|
||||
return $pyvenvConfig
|
||||
}
|
||||
|
||||
|
||||
<# Begin Activate script --------------------------------------------------- #>
|
||||
|
||||
# Determine the containing directory of this script
|
||||
$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
|
||||
$VenvExecDir = Get-Item -Path $VenvExecPath
|
||||
|
||||
Write-Verbose "Activation script is located in path: '$VenvExecPath'"
|
||||
Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
|
||||
Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
|
||||
|
||||
# Set values required in priority: CmdLine, ConfigFile, Default
|
||||
# First, get the location of the virtual environment, it might not be
|
||||
# VenvExecDir if specified on the command line.
|
||||
if ($VenvDir) {
|
||||
Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
|
||||
}
|
||||
else {
|
||||
Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
|
||||
$VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
|
||||
Write-Verbose "VenvDir=$VenvDir"
|
||||
}
|
||||
|
||||
# Next, read the `pyvenv.cfg` file to determine any required value such
|
||||
# as `prompt`.
|
||||
$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
|
||||
|
||||
# Next, set the prompt from the command line, or the config file, or
|
||||
# just use the name of the virtual environment folder.
|
||||
if ($Prompt) {
|
||||
Write-Verbose "Prompt specified as argument, using '$Prompt'"
|
||||
}
|
||||
else {
|
||||
Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
|
||||
if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
|
||||
Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
|
||||
$Prompt = $pyvenvCfg['prompt'];
|
||||
}
|
||||
else {
|
||||
Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
|
||||
Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
|
||||
$Prompt = Split-Path -Path $venvDir -Leaf
|
||||
}
|
||||
}
|
||||
|
||||
Write-Verbose "Prompt = '$Prompt'"
|
||||
Write-Verbose "VenvDir='$VenvDir'"
|
||||
|
||||
# Deactivate any currently active virtual environment, but leave the
|
||||
# deactivate function in place.
|
||||
deactivate -nondestructive
|
||||
|
||||
# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
|
||||
# that there is an activated venv.
|
||||
$env:VIRTUAL_ENV = $VenvDir
|
||||
|
||||
if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
|
||||
|
||||
Write-Verbose "Setting prompt to '$Prompt'"
|
||||
|
||||
# Set the prompt to include the env name
|
||||
# Make sure _OLD_VIRTUAL_PROMPT is global
|
||||
function global:_OLD_VIRTUAL_PROMPT { "" }
|
||||
Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
|
||||
New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
|
||||
|
||||
function global:prompt {
|
||||
Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
|
||||
_OLD_VIRTUAL_PROMPT
|
||||
}
|
||||
$env:VIRTUAL_ENV_PROMPT = $Prompt
|
||||
}
|
||||
|
||||
# Clear PYTHONHOME
|
||||
if (Test-Path -Path Env:PYTHONHOME) {
|
||||
Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
|
||||
Remove-Item -Path Env:PYTHONHOME
|
||||
}
|
||||
|
||||
# Add the venv to the PATH
|
||||
Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
|
||||
$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"
|
||||
@ -0,0 +1,69 @@
|
||||
# This file must be used with "source bin/activate" *from bash*
|
||||
# you cannot run it directly
|
||||
|
||||
deactivate () {
|
||||
# reset old environment variables
|
||||
if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
|
||||
PATH="${_OLD_VIRTUAL_PATH:-}"
|
||||
export PATH
|
||||
unset _OLD_VIRTUAL_PATH
|
||||
fi
|
||||
if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
|
||||
PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
|
||||
export PYTHONHOME
|
||||
unset _OLD_VIRTUAL_PYTHONHOME
|
||||
fi
|
||||
|
||||
# This should detect bash and zsh, which have a hash command that must
|
||||
# be called to get it to forget past commands. Without forgetting
|
||||
# past commands the $PATH changes we made may not be respected
|
||||
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
|
||||
hash -r 2> /dev/null
|
||||
fi
|
||||
|
||||
if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
|
||||
PS1="${_OLD_VIRTUAL_PS1:-}"
|
||||
export PS1
|
||||
unset _OLD_VIRTUAL_PS1
|
||||
fi
|
||||
|
||||
unset VIRTUAL_ENV
|
||||
unset VIRTUAL_ENV_PROMPT
|
||||
if [ ! "${1:-}" = "nondestructive" ] ; then
|
||||
# Self destruct!
|
||||
unset -f deactivate
|
||||
fi
|
||||
}
|
||||
|
||||
# unset irrelevant variables
|
||||
deactivate nondestructive
|
||||
|
||||
VIRTUAL_ENV="/home/p/Documents/Mapping/scraping/perils/scrapy-env"
|
||||
export VIRTUAL_ENV
|
||||
|
||||
_OLD_VIRTUAL_PATH="$PATH"
|
||||
PATH="$VIRTUAL_ENV/bin:$PATH"
|
||||
export PATH
|
||||
|
||||
# unset PYTHONHOME if set
|
||||
# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
|
||||
# could use `if (set -u; : $PYTHONHOME) ;` in bash
|
||||
if [ -n "${PYTHONHOME:-}" ] ; then
|
||||
_OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
|
||||
unset PYTHONHOME
|
||||
fi
|
||||
|
||||
if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
|
||||
_OLD_VIRTUAL_PS1="${PS1:-}"
|
||||
PS1="(scrapy-env) ${PS1:-}"
|
||||
export PS1
|
||||
VIRTUAL_ENV_PROMPT="(scrapy-env) "
|
||||
export VIRTUAL_ENV_PROMPT
|
||||
fi
|
||||
|
||||
# This should detect bash and zsh, which have a hash command that must
|
||||
# be called to get it to forget past commands. Without forgetting
|
||||
# past commands the $PATH changes we made may not be respected
|
||||
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
|
||||
hash -r 2> /dev/null
|
||||
fi
|
||||
@ -0,0 +1,26 @@
|
||||
# This file must be used with "source bin/activate.csh" *from csh*.
|
||||
# You cannot run it directly.
|
||||
# Created by Davide Di Blasi <davidedb@gmail.com>.
|
||||
# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
|
||||
|
||||
alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate'
|
||||
|
||||
# Unset irrelevant variables.
|
||||
deactivate nondestructive
|
||||
|
||||
setenv VIRTUAL_ENV "/home/p/Documents/Mapping/scraping/perils/scrapy-env"
|
||||
|
||||
set _OLD_VIRTUAL_PATH="$PATH"
|
||||
setenv PATH "$VIRTUAL_ENV/bin:$PATH"
|
||||
|
||||
|
||||
set _OLD_VIRTUAL_PROMPT="$prompt"
|
||||
|
||||
if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
|
||||
set prompt = "(scrapy-env) $prompt"
|
||||
setenv VIRTUAL_ENV_PROMPT "(scrapy-env) "
|
||||
endif
|
||||
|
||||
alias pydoc python -m pydoc
|
||||
|
||||
rehash
|
||||
@ -0,0 +1,66 @@
|
||||
# This file must be used with "source <venv>/bin/activate.fish" *from fish*
|
||||
# (https://fishshell.com/); you cannot run it directly.
|
||||
|
||||
function deactivate -d "Exit virtual environment and return to normal shell environment"
|
||||
# reset old environment variables
|
||||
if test -n "$_OLD_VIRTUAL_PATH"
|
||||
set -gx PATH $_OLD_VIRTUAL_PATH
|
||||
set -e _OLD_VIRTUAL_PATH
|
||||
end
|
||||
if test -n "$_OLD_VIRTUAL_PYTHONHOME"
|
||||
set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
|
||||
set -e _OLD_VIRTUAL_PYTHONHOME
|
||||
end
|
||||
|
||||
if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
|
||||
functions -e fish_prompt
|
||||
set -e _OLD_FISH_PROMPT_OVERRIDE
|
||||
functions -c _old_fish_prompt fish_prompt
|
||||
functions -e _old_fish_prompt
|
||||
end
|
||||
|
||||
set -e VIRTUAL_ENV
|
||||
set -e VIRTUAL_ENV_PROMPT
|
||||
if test "$argv[1]" != "nondestructive"
|
||||
# Self-destruct!
|
||||
functions -e deactivate
|
||||
end
|
||||
end
|
||||
|
||||
# Unset irrelevant variables.
|
||||
deactivate nondestructive
|
||||
|
||||
set -gx VIRTUAL_ENV "/home/p/Documents/Mapping/scraping/perils/scrapy-env"
|
||||
|
||||
set -gx _OLD_VIRTUAL_PATH $PATH
|
||||
set -gx PATH "$VIRTUAL_ENV/bin" $PATH
|
||||
|
||||
# Unset PYTHONHOME if set.
|
||||
if set -q PYTHONHOME
|
||||
set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
|
||||
set -e PYTHONHOME
|
||||
end
|
||||
|
||||
if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
|
||||
# fish uses a function instead of an env var to generate the prompt.
|
||||
|
||||
# Save the current fish_prompt function as the function _old_fish_prompt.
|
||||
functions -c fish_prompt _old_fish_prompt
|
||||
|
||||
# With the original prompt function renamed, we can override with our own.
|
||||
function fish_prompt
|
||||
# Save the return status of the last command.
|
||||
set -l old_status $status
|
||||
|
||||
# Output the venv prompt; color taken from the blue of the Python logo.
|
||||
printf "%s%s%s" (set_color 4B8BBE) "(scrapy-env) " (set_color normal)
|
||||
|
||||
# Restore the return status of the previous command.
|
||||
echo "exit $old_status" | .
|
||||
# Output the original/"old" prompt.
|
||||
_old_fish_prompt
|
||||
end
|
||||
|
||||
set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
|
||||
set -gx VIRTUAL_ENV_PROMPT "(scrapy-env) "
|
||||
end
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from automat._visualize import tool
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(tool())
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from twisted.conch.scripts.cftp import run
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(run())
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from twisted.conch.scripts.ckeygen import run
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(run())
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from twisted.conch.scripts.conch import run
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(run())
|
||||
@ -0,0 +1,54 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
from pprint import pformat
|
||||
|
||||
import jmespath
|
||||
from jmespath import exceptions
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('expression')
|
||||
parser.add_argument('-f', '--filename',
|
||||
help=('The filename containing the input data. '
|
||||
'If a filename is not given then data is '
|
||||
'read from stdin.'))
|
||||
parser.add_argument('--ast', action='store_true',
|
||||
help=('Pretty print the AST, do not search the data.'))
|
||||
args = parser.parse_args()
|
||||
expression = args.expression
|
||||
if args.ast:
|
||||
# Only print the AST
|
||||
expression = jmespath.compile(args.expression)
|
||||
sys.stdout.write(pformat(expression.parsed))
|
||||
sys.stdout.write('\n')
|
||||
return 0
|
||||
if args.filename:
|
||||
with open(args.filename, 'r') as f:
|
||||
data = json.load(f)
|
||||
else:
|
||||
data = sys.stdin.read()
|
||||
data = json.loads(data)
|
||||
try:
|
||||
sys.stdout.write(json.dumps(
|
||||
jmespath.search(expression, data), indent=4, ensure_ascii=False))
|
||||
sys.stdout.write('\n')
|
||||
except exceptions.ArityError as e:
|
||||
sys.stderr.write("invalid-arity: %s\n" % e)
|
||||
return 1
|
||||
except exceptions.JMESPathTypeError as e:
|
||||
sys.stderr.write("invalid-type: %s\n" % e)
|
||||
return 1
|
||||
except exceptions.UnknownFunctionError as e:
|
||||
sys.stderr.write("unknown-function: %s\n" % e)
|
||||
return 1
|
||||
except exceptions.ParseError as e:
|
||||
sys.stderr.write("syntax-error: %s\n" % e)
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from twisted.mail.scripts.mailmail import run
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(run())
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from charset_normalizer.cli.normalizer import cli_detect
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(cli_detect())
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from pip._internal.cli.main import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from pip._internal.cli.main import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from pip._internal.cli.main import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from twisted.scripts.htmlizer import run
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(run())
|
||||
@ -0,0 +1 @@
|
||||
python3
|
||||
@ -0,0 +1 @@
|
||||
/usr/bin/python3
|
||||
@ -0,0 +1 @@
|
||||
python3
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from scrapy.cmdline import execute
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(execute())
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from twisted.conch.scripts.tkconch import run
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(run())
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from tldextract.cli import main
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(main())
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from twisted.scripts.trial import run
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(run())
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from twisted.application.twist._twist import Twist
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(Twist.main())
|
||||
@ -0,0 +1,8 @@
|
||||
#!/home/p/Documents/Mapping/scraping/perils/scrapy-env/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import sys
|
||||
from twisted.scripts.twistd import run
|
||||
if __name__ == '__main__':
|
||||
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
|
||||
sys.exit(run())
|
||||
@ -0,0 +1 @@
|
||||
lib
|
||||
@ -0,0 +1,3 @@
|
||||
home = /usr/bin
|
||||
include-system-site-packages = false
|
||||
version = 3.10.8
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,33 @@
|
||||
import scrapy
|
||||
# import regex
|
||||
# from scrapy.loader import ItemLoader
|
||||
# from itemloaders.processors import TakeFirst, MapCompose
|
||||
# from w3lib.html import remove_tags
|
||||
|
||||
# # Une fonction qui sépare les adresses en fonction des noms de rue
|
||||
# def splitStreet(value):
|
||||
# # Le regex qui évalue là où il faut séparer la chaîne de caractère
|
||||
# # (présence de "et / - + ainsi que"
|
||||
# # sans que ce séparateur ne soit placé près d'un groupe de numéros de la même rue)
|
||||
# expr = r"((?<!(\d\sa)|(\d\sb)|(\d\sbis)|(\d\ster)|(\d)|(\da)|(\db)|(\dbis)|(\dter)|(\d\s)|(\da\s)|(\db\s)|(\dbis\s)|(\dter\s)|(\d\sa\s)|(\d\sb\s)|(\d\sbis\s)|(\d\ster\s))\/|((?<!(\d))\s\bet)|(ainsi\sque)|((?<!(\d))\s-\s(?!(bt)|(bis)|(ter)))|(?<!(\d\sa)|(\d\sb)|(\d\sbis)|(\d\ster)|(\d)|(\da)|(\db)|(\dbis)|(\dter)|(\d\s)|(\da\s)|(\db\s)|(\dbis\s)|(\dter\s)|(\d\sa\s)|(\d\sb\s)|(\d\sbis\s)|(\d\ster\s))\+)"
|
||||
# # Remplacement de chaque séparateur par un pipe ("|")
|
||||
# subst = "|"
|
||||
# repl = regex.sub(expr, subst, value, 0, regex.MULTILINE | regex.IGNORECASE)
|
||||
# # Formattage (supression des espaces insécables et des ":")
|
||||
# filtered = regex.sub('\\xa0|:', '', repl, 0, regex.MULTILINE | regex.IGNORECASE)
|
||||
# # Séparation en liste
|
||||
# splitted = regex.split('\|', filtered, regex.MULTILINE | regex.IGNORECASE)
|
||||
# # Suppression des espaces en début et fin de chaîne
|
||||
# stripped = [x.strip() for x in splitted]
|
||||
# # Suppression des "None" de la liste (failsafe, ne devrait pas être nécessaire)
|
||||
# result = list(filter(None, stripped))
|
||||
# return result
|
||||
|
||||
|
||||
|
||||
class PerilsItem(scrapy.Item):
|
||||
adrs = scrapy.Field()
|
||||
dernierA = scrapy.Field()
|
||||
As = scrapy.Field()
|
||||
raw = scrapy.Field()
|
||||
pass
|
||||
@ -0,0 +1,13 @@
|
||||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
|
||||
|
||||
# useful for handling different item types with a single interface
|
||||
from itemadapter import ItemAdapter
|
||||
|
||||
|
||||
class PerilsPipeline:
|
||||
def process_item(self, item, spider):
|
||||
return item
|
||||
@ -0,0 +1,93 @@
|
||||
# Scrapy settings for perils project
|
||||
#
|
||||
# For simplicity, this file contains only settings considered important or
|
||||
# commonly used. You can find more settings consulting the documentation:
|
||||
#
|
||||
# https://docs.scrapy.org/en/latest/topics/settings.html
|
||||
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
BOT_NAME = 'perils'
|
||||
|
||||
SPIDER_MODULES = ['perils.spiders']
|
||||
NEWSPIDER_MODULE = 'perils.spiders'
|
||||
|
||||
|
||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||
#USER_AGENT = 'perils (+http://www.yourdomain.com)'
|
||||
|
||||
# Obey robots.txt rules
|
||||
ROBOTSTXT_OBEY = True
|
||||
|
||||
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||
#CONCURRENT_REQUESTS = 32
|
||||
|
||||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||
# See also autothrottle settings and docs
|
||||
#DOWNLOAD_DELAY = 3
|
||||
# The download delay setting will honor only one of:
|
||||
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||
#CONCURRENT_REQUESTS_PER_IP = 16
|
||||
|
||||
# Disable cookies (enabled by default)
|
||||
#COOKIES_ENABLED = False
|
||||
|
||||
# Disable Telnet Console (enabled by default)
|
||||
#TELNETCONSOLE_ENABLED = False
|
||||
|
||||
# Override the default request headers:
|
||||
#DEFAULT_REQUEST_HEADERS = {
|
||||
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
# 'Accept-Language': 'en',
|
||||
#}
|
||||
|
||||
# Enable or disable spider middlewares
|
||||
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
SPIDER_MIDDLEWARES = {
|
||||
# 'perils.middlewares.PerilsSpiderMiddleware': 543,
|
||||
'perils.splittermidware.SplitAndSort': 543,
|
||||
}
|
||||
|
||||
# Enable or disable downloader middlewares
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
#DOWNLOADER_MIDDLEWARES = {
|
||||
# 'perils.middlewares.PerilsDownloaderMiddleware': 543,
|
||||
#}
|
||||
|
||||
# Enable or disable extensions
|
||||
# See https://docs.scrapy.org/en/latest/topics/extensions.html
|
||||
#EXTENSIONS = {
|
||||
# 'scrapy.extensions.telnet.TelnetConsole': None,
|
||||
#}
|
||||
|
||||
# Configure item pipelines
|
||||
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
#ITEM_PIPELINES = {
|
||||
# 'perils.pipelines.PerilsPipeline': 300,
|
||||
#}
|
||||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
|
||||
#AUTOTHROTTLE_ENABLED = True
|
||||
# The initial download delay
|
||||
#AUTOTHROTTLE_START_DELAY = 5
|
||||
# The maximum download delay to be set in case of high latencies
|
||||
#AUTOTHROTTLE_MAX_DELAY = 60
|
||||
# The average number of requests Scrapy should be sending in parallel to
|
||||
# each remote server
|
||||
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||
# Enable showing throttling stats for every response received:
|
||||
#AUTOTHROTTLE_DEBUG = False
|
||||
|
||||
# Enable and configure HTTP caching (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||
#HTTPCACHE_ENABLED = True
|
||||
#HTTPCACHE_EXPIRATION_SECS = 0
|
||||
#HTTPCACHE_DIR = 'httpcache'
|
||||
#HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
|
||||
|
||||
# Set settings whose default value is deprecated to a future-proof value
|
||||
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
|
||||
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
|
||||
@ -0,0 +1,4 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
||||
@ -0,0 +1,30 @@
|
||||
import scrapy
|
||||
from perils.items import PerilsItem
|
||||
# from scrapy.loader import ItemLoader
|
||||
|
||||
class ScrapePerils(scrapy.Spider):
|
||||
name = "perils"
|
||||
start_urls = ["https://www.marseille.fr/logement-urbanisme/am%C3%A9lioration-de-lhabitat/arretes-de-peril"]
|
||||
|
||||
def parse(self, response):
|
||||
for adresses in response.xpath('//div[@class="card"]//li|//div[@class="card"]//li/p'):
|
||||
item = PerilsItem()
|
||||
# l = ItemLoader(item = PerilsItem(), selector=adresses)
|
||||
|
||||
# l.add_xpath('adrs', './text()')
|
||||
# l.add_xpath('dernierA', './a[not(contains(translate(.,"MODIF","modif"),"modif"))][last()]/text()')
|
||||
# if adresses.xpath('./a[not(contains(translate(.,"MODIF","modif"),"modif"))][last()]/text()').get() is None:
|
||||
# l.add_xpath('dernierA', './a[last()]/text()')
|
||||
|
||||
item['adrs'] = adresses.xpath('./text()').get(),
|
||||
item['dernierA'] = adresses.xpath('./a[not(contains(translate(.,"MODIF","modif"),"modif"))][last()]/text()').get(),
|
||||
if adresses.xpath('./a[not(contains(translate(.,"MODIF","modif"),"modif"))][last()]/text()').get() is None:
|
||||
item['dernierA'] = adresses.xpath('./a[last()]/text()').get(),
|
||||
item['As'] = adresses.xpath('./a/text()').getall()
|
||||
item['raw'] = adresses.xpath('.').get()
|
||||
|
||||
yield item
|
||||
|
||||
|
||||
|
||||
#response.xpath('//div[@class="card"]//li/text()[1]|//div[@class="card"]//li/p/text()[1]').getall()
|
||||
@ -0,0 +1,13 @@
|
||||
import scrapy
|
||||
|
||||
class ScrapePerils(scrapy.Spider):
|
||||
name = "perils"
|
||||
start_urls = ["https://www.marseille.fr/logement-urbanisme/am%C3%A9lioration-de-lhabitat/arretes-de-peril"]
|
||||
|
||||
def parse(self, response):
|
||||
for adresses in response.xpath('//div[@class="card"]//li'):
|
||||
yield {
|
||||
'adresse': adresses.xpath('./text()').get(),
|
||||
'dernier arrêté hors modificatif' : adresses.xpath('./a[not(contains(translate(.,"MODIF","modif"),"modif"))][last()]/text()').get(),
|
||||
'dernier arrêté' : adresses.xpath('./a[last()]/text()').get(),
|
||||
}
|
||||
@ -0,0 +1,11 @@
|
||||
# Automatically created by: scrapy startproject
|
||||
#
|
||||
# For more information about the [deploy] section see:
|
||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||
|
||||
[settings]
|
||||
default = perils.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
project = perils
|
||||
Loading…
Reference in new issue